EulerFS is an NVDIMM-based filesystem. It uses soft updates and pointer-based dual views to delay synchronous cache flushes and significantly reduce latency on the critical path.
We ran EulerFS through xfstests; no WARNING or BUG messages were found in the kernel log.
We measured performance with multiple benchmarks; the results are significantly better than ext4 in DAX mode.
We also ran code coverage testing: line coverage is over 80% and function coverage is close to 100%.
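As a rough illustration of the dual-view idea mentioned above: each directory entry (struct nv_dict_entry, added later in this series) carries both a persisted link (next) and a volatile link (volatile_next), and lookups consult the volatile view first so that updates to the chain need not be flushed synchronously. The helper below is a hypothetical sketch, not code from these patches:

/* Hypothetical sketch: pick the effective "next" link of a dentry. */
static struct nv_dict_entry *demo_effective_next(struct super_block *sb,
                                                 const struct nv_dict_entry *de)
{
        u64 vnext = le64_to_cpu(de->volatile_next);

        if (vnext == EUFS_DIR_EOC)  /* chain ends here, even if ->next is stale */
                return NULL;
        if (vnext)                  /* the volatile view is authoritative */
                return s2p(sb, de->volatile_next);
        return s2p(sb, de->next);   /* fall back to the persisted view */
}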
Yu Kuai (17): eulerfs: common definitions eulerfs: add kmeme_cache definitions and interfaces eulerfs: add memory allocation interfaces eulerfs: add flush interfaces eulerfs: add interfaces for inode lock transfer eulerfs: add interfaces for page wear eulerfs: add filename interfaces eulerfs: add nv dict operations eulerfs: add dependency operations eulerfs: add inode related interfaces eulerfs: add dax operations eulerfs: add file operations and inode operations for regular file eulerfs: add inode_operations for dir inode and special inode eulerfs: add file_operations for dir inode eulerfs: add inode_operations for symlink inode eulerfs: add super_operations and module_init/exit eulerfs: add Kconfig and Makefile
fs/Kconfig | 1 + fs/Makefile | 1 + fs/eulerfs/Kconfig | 10 + fs/eulerfs/Makefile | 9 + fs/eulerfs/alloc_interface.h | 113 +++ fs/eulerfs/const.h | 80 ++ fs/eulerfs/dax.c | 1696 ++++++++++++++++++++++++++++++++++ fs/eulerfs/dax.h | 101 ++ fs/eulerfs/dep.c | 791 ++++++++++++++++ fs/eulerfs/dep.h | 218 +++++ fs/eulerfs/dht.c | 312 +++++++ fs/eulerfs/dht.h | 156 ++++ fs/eulerfs/dir.c | 139 +++ fs/eulerfs/euler.h | 84 ++ fs/eulerfs/euler_common.h | 225 +++++ fs/eulerfs/euler_dbg.h | 36 + fs/eulerfs/euler_def.h | 201 ++++ fs/eulerfs/file.c | 294 ++++++ fs/eulerfs/filename.h | 120 +++ fs/eulerfs/flush.h | 171 ++++ fs/eulerfs/inode.c | 602 ++++++++++++ fs/eulerfs/inode.h | 44 + fs/eulerfs/kmem_cache.c | 107 +++ fs/eulerfs/kmem_cache.h | 37 + fs/eulerfs/lock.h | 49 + fs/eulerfs/namei.c | 872 +++++++++++++++++ fs/eulerfs/nvalloc.c | 1451 +++++++++++++++++++++++++++++ fs/eulerfs/nvalloc.h | 214 +++++ fs/eulerfs/nvm_struct.h | 297 ++++++ fs/eulerfs/pbatch.h | 314 +++++++ fs/eulerfs/super.c | 811 ++++++++++++++++ fs/eulerfs/symlink.c | 29 + fs/eulerfs/wear.c | 48 + fs/eulerfs/wear.h | 30 + 34 files changed, 9663 insertions(+) create mode 100644 fs/eulerfs/Kconfig create mode 100644 fs/eulerfs/Makefile create mode 100644 fs/eulerfs/alloc_interface.h create mode 100644 fs/eulerfs/const.h create mode 100644 fs/eulerfs/dax.c create mode 100644 fs/eulerfs/dax.h create mode 100644 fs/eulerfs/dep.c create mode 100644 fs/eulerfs/dep.h create mode 100644 fs/eulerfs/dht.c create mode 100644 fs/eulerfs/dht.h create mode 100644 fs/eulerfs/dir.c create mode 100644 fs/eulerfs/euler.h create mode 100644 fs/eulerfs/euler_common.h create mode 100644 fs/eulerfs/euler_dbg.h create mode 100644 fs/eulerfs/euler_def.h create mode 100644 fs/eulerfs/file.c create mode 100644 fs/eulerfs/filename.h create mode 100644 fs/eulerfs/flush.h create mode 100644 fs/eulerfs/inode.c create mode 100644 fs/eulerfs/inode.h create mode 100644 fs/eulerfs/kmem_cache.c create mode 100644 fs/eulerfs/kmem_cache.h create mode 100644 fs/eulerfs/lock.h create mode 100644 fs/eulerfs/namei.c create mode 100644 fs/eulerfs/nvalloc.c create mode 100644 fs/eulerfs/nvalloc.h create mode 100644 fs/eulerfs/nvm_struct.h create mode 100644 fs/eulerfs/pbatch.h create mode 100644 fs/eulerfs/super.c create mode 100644 fs/eulerfs/symlink.c create mode 100644 fs/eulerfs/wear.c create mode 100644 fs/eulerfs/wear.h
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR CVE: NA
--------------------------------------
Add the common definitions, on-media structures, and inline interfaces that will be used by the later patches in this series.
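For illustration, pointers stored on pmem are kept as offsets and converted with the o2p()/p2o()/s2p()/p2s() helpers defined in euler_common.h; a minimal, hypothetical usage sketch (only the helpers and struct fields come from this patch):

/* Hypothetical sketch: follow an on-media __le64 link to the next name
 * extension, the same way key_equals() uses s2p(). */
static struct nv_name_ext *demo_next_name(struct super_block *sb,
                                          const struct nv_name_ext *ext)
{
        /* s2p(): on-media __le64 offset -> kernel virtual pointer;
         * p2s(): the reverse, used when a pointer is written to pmem. */
        return s2p(sb, ext->nextname);
}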
Signed-off-by: Mingkai Dong dongmingkai1@huawei.com Signed-off-by: Hou Tao houtao1@huawei.com Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/eulerfs/const.h | 80 ++++++++++ fs/eulerfs/euler.h | 84 ++++++++++ fs/eulerfs/euler_common.h | 225 +++++++++++++++++++++++++++ fs/eulerfs/euler_dbg.h | 36 +++++ fs/eulerfs/euler_def.h | 201 ++++++++++++++++++++++++ fs/eulerfs/nvm_struct.h | 297 +++++++++++++++++++++++++++++++++++ fs/eulerfs/pbatch.h | 314 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 1237 insertions(+) create mode 100644 fs/eulerfs/const.h create mode 100644 fs/eulerfs/euler.h create mode 100644 fs/eulerfs/euler_common.h create mode 100644 fs/eulerfs/euler_dbg.h create mode 100644 fs/eulerfs/euler_def.h create mode 100644 fs/eulerfs/nvm_struct.h create mode 100644 fs/eulerfs/pbatch.h
diff --git a/fs/eulerfs/const.h b/fs/eulerfs/const.h new file mode 100644 index 000000000000..1e3485ecc8a0 --- /dev/null +++ b/fs/eulerfs/const.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_CONST_H +#define EUFS_CONST_H + +/* EULER */ +#define EUFS_SUPER_MAGIC 0x50C9 +/* Mount flags */ +#define EUFS_MOUNT_ERRORS_RO 0x000001 /* Remount fs ro on errors */ +#define EUFS_MOUNT_ERRORS_PANIC 0x000002 /* Panic on errors */ +#define EUFS_MOUNT_FORMAT 0x000004 /* was FS formatted on mount? */ + +#define NULL_ADDR ((u64)-1ll) +#define NULL_VAL (0) +#define NULL_ADDR_PTR ((void *)(NULL_ADDR)) + +/* FS Limits */ +#define EUFS_MAX_NAME_LEN (255) +#define EUFS_LINK_MAX (32000) /* max links to a file */ + +/* layout: hash_len (u64) + sym_link + trailing zero */ +#define EUFS_MAX_SYMLINK_LEN (PAGE_SIZE - sizeof(u64) - 1) +#define EUFS_SYMLINK_HASHLEN_LEN(hashlen) (((hashlen) >> 48) & 0xfff) +#define EUFS_SYMLINK_SIZE(len) ((len) + sizeof(u64) + 1) + +#define EUFS_BLOCK_SIZE (4096) +#define EUFS_BLOCK_SIZE_BITS (12) + +/* The initial height is 0 when the file tree contains no or one block */ +#define EUFS_MAX_FILE_TREE_HEIGHT 3 +#define EUFS_FILE_TREE_DEGREE_SHIFT 9 +#define EUFS_FILE_TREE_DEGREE (1U << EUFS_FILE_TREE_DEGREE_SHIFT) +#define EUFS_MAX_FILE_BLK_CNT \ + (1ll << (EUFS_MAX_FILE_TREE_HEIGHT * EUFS_FILE_TREE_DEGREE_SHIFT)) +#define EUFS_MAX_FILE_SIZE (4096ll * EUFS_MAX_FILE_BLK_CNT) + +#define EUFS_POISON_POINTER ((void *)0x1010101010101010UL) +#define EUFS_POISON_VALUE ((u64)0x1010101010101010UL) + +#define CACHELINE_SIZE (64) + +#define EUFS_ALLOC_BLOCKS_ZERO_NONE (0x0) /* Zero none NULL_ADDR pages */ +#define EUFS_ALLOC_BLOCKS_ZERO_ALL (0x1) /* Zero all NULL_ADDR pages */ +#define EUFS_ALLOC_BLOCKS_ZERO_EDGE (0x2) /* Zero edge NULL_ADDR pages */ + +#define EUFS_INODE_SIZE (CACHELINE_SIZE * 2) + +#define NV_DICT_CAPACITY (512ULL) +/* + * EOC stands for "End Of Chain". + * + * When volatile bucket (namely table[idx]) is EUFS_DIR_EOC_PTR, + * it means that both volatile bucket and persist bucket are empty. + * When volatile bucket is NULL, it just means that volatile + * bucket is empty. + * + * When volatile_next is EUFS_DIR_EOC, it means current entry is + * the last one in the chain although its next may still points + * to an entry (because the setting and persistence of next are + * deferred). When volatile_next is NULL, it means next should be + * checked to ensure whether or not the current entry is the last + * one in the chain. + */ +#define EUFS_DIR_EOC ((u64)-1) +#define EUFS_DIR_EOC_PTR ((void *)EUFS_DIR_EOC) +/* DIR DELeted NEW dentry */ +#define EUFS_DIR_DELNEW ((u64)0x3030303030303030UL) + +#endif /* EUFS_CONST_H */ diff --git a/fs/eulerfs/euler.h b/fs/eulerfs/euler.h new file mode 100644 index 000000000000..0abb7602bb63 --- /dev/null +++ b/fs/eulerfs/euler.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. 
+ * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_H +#define EUFS_H + +#include <linux/crc16.h> +#include <linux/crc32.h> +#include <linux/crc32c.h> +#include <linux/version.h> +#include <linux/pagemap.h> +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/uio.h> +#include <linux/mutex.h> +#include <linux/slab.h> + +/* annotation for pointer to persistent memory */ +#define __pmem + +#define NV_CHECK (0) + +#if NV_CHECK +#pragma message "NV CHECK IS TURNED ON! NO PERF. EVAL.!" +#endif + +#if NV_CHECK +#define NV_ASSERT(x) \ + do { \ + if (!(x)) { \ + eufs_warn("assertion failed %s:%d: %s\n", __FILE__, \ + __LINE__, #x); \ + } \ + WARN(!(x), "detail:"); \ + } while (0) +#else +#define NV_ASSERT(x) +#endif + +#include "const.h" +#include "euler_dbg.h" +#include "nvm_struct.h" +#include "euler_def.h" +#include "kmem_cache.h" +#include "flush.h" +#include "euler_common.h" +#include "inode.h" +#include "nvalloc.h" + +extern int num_sockets; + +/* Function Prototypes */ +extern __printf(2, 3) void eufs_error_mng(struct super_block *sb, + const char *fmt, ...); + +/* dir.c */ +extern const struct file_operations eufs_dir_operations; + +/* file.c */ +extern const struct inode_operations eufs_file_inode_operations; +extern const struct file_operations eufs_file_operations; +int eufs_fsync(struct file *file, loff_t start, loff_t end, int datasync); + +/* inode.c */ +extern const struct address_space_operations eufs_aops; + +/* namei.c */ +extern const struct inode_operations eufs_dir_inode_operations; +extern const struct inode_operations eufs_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations eufs_symlink_inode_operations; + +#endif /* EUFS_H */ diff --git a/fs/eulerfs/euler_common.h b/fs/eulerfs/euler_common.h new file mode 100644 index 000000000000..b7684de19c7d --- /dev/null +++ b/fs/eulerfs/euler_common.h @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef EUFS_COMMON_H +#define EUFS_COMMON_H + +#include <linux/crc16.h> +#include <linux/crc32.h> +#include <linux/crc32c.h> +#include <linux/pagemap.h> +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/uio.h> +#include <linux/mutex.h> +#include <linux/version.h> +#include <linux/slab.h> +#include <linux/fs.h> + +#ifndef EUFS_H +#error "Please include euler_common.h by including euler.h" +#endif + +#define EUFS_INODE_CNT_IN_RENAME 4 + +#define PAGE_DIV_ROUND_UP(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) +#define PAGE_DIV_ROUND_DOWN(x) (((x)) >> PAGE_SHIFT) + +#define clear_opt(o, opt) (o &= ~EUFS_MOUNT_##opt) +#define set_opt(o, opt) (o |= EUFS_MOUNT_##opt) +#define test_opt(sb, opt) (EUFS_SB(sb)->s_mount_opt & EUFS_MOUNT_##opt) + +static __always_inline void *o2p(struct super_block *sb, u64 offset); +static __always_inline u64 p2o(struct super_block *sb, void *ptr); + +static __always_inline struct eufs_sb_info *EUFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static __always_inline struct eufs_inode_info *EUFS_I(struct inode *inode) +{ + return container_of(inode, struct eufs_inode_info, vfs_inode); +} + +static __always_inline struct eufs_inode *EUFS_PI(struct inode *inode) +{ + return (struct eufs_inode *)o2p(inode->i_sb, inode->i_ino); +} + +static __always_inline unsigned long eufs_pi2ino(struct super_block *sb, + struct eufs_inode *pi) +{ + return p2o(sb, EUFS_HEAD_PI(pi)); +} + +static __always_inline struct eufs_super_block * +eufs_get_super(struct super_block *sb) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + + return (struct eufs_super_block *)sbi->virt_addr; +} + +static __always_inline void *eufs_get_renamej(struct super_block *sb, int cpu) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + + return (void *)((u64)sbi->renamej + EUFS_RENAMEJ_ENTRY_SIZE * cpu); +} + +/* + * o: offset: u64 + * p: pointer: void * + * s: storage: __le64 + */ +static __always_inline void *o2p(struct super_block *sb, u64 offset) +{ + if (offset == 0) + return NULL; + if (offset == -1) + return (void *)-1; + return (void *)(EUFS_SB(sb)->virt_addr + offset); +} + +static __always_inline u64 p2o(struct super_block *sb, void *ptr) +{ + if (ptr == NULL) + return 0; + if (ptr == (void *)-1) + return -1; + return (u64)(ptr - EUFS_SB(sb)->virt_addr); +} + +/* pointer to storage */ +static __always_inline __le64 p2s(struct super_block *sb, void *ptr) +{ + return cpu_to_le64(p2o(sb, ptr)); +} +/* storage to pointer */ +static __always_inline void *s2p(struct super_block *sb, __le64 s) +{ + return o2p(sb, le64_to_cpu(s)); +} + +static __always_inline bool +eufs_access_ok(struct super_block *sb, const void *pointer, unsigned long sz) +{ + return true; +} + +#define eufs_ptr_fast_check_b(ptr) 0 + +#define eufs_ptr_fast_check(ptr) BUG_ON(eufs_ptr_fast_check_b(ptr)) + +#define HASHLEN_LEN(hashlen) (((hashlen) >> 48) & 0xff) + +static __always_inline hashlen_t hash(const char *name, size_t len) +{ + static const int seed = 131; + u64 r = 0; + int i; + + for (i = 0; i < len; ++i) + r = r * seed + (int)name[i]; + + return (u64)len << 48 | (r & 0xffffffffffff); +} + +static __always_inline bool key_equals(struct super_block *sb, const char *key, + hashlen_t hashlen, + const struct nv_dict_entry *de) +{ + int len; + struct nv_name_ext *p; + + NV_ASSERT(key); + NV_ASSERT(hashlen); + + if (hashlen != de->hv) + return false; + len = HASHLEN_LEN(hashlen); + if (likely(len <= FIRST_LEN)) + return memcmp(de->name, key, len) == 0; + if (memcmp(de->name, key, FIRST_LEN)) + return false; + 
eufs_dbg("first len ok\n"); + len -= FIRST_LEN; + p = s2p(sb, de->nextname); + key += FIRST_LEN; + while (len > FOLLOW_LEN) { + eufs_dbg("check again p:%*s key:%*s\n", (int)FOLLOW_LEN, + p->name, (int)FOLLOW_LEN, key); + if (memcmp(p->name, key, FOLLOW_LEN)) + return false; + p = s2p(sb, p->nextname); + key += FOLLOW_LEN; + len -= FOLLOW_LEN; + } + eufs_dbg("final check name p:%*s key:%*s\n", len, p->name, len, key); + return !memcmp(p->name, key, len); +} + +static __always_inline void eufs_flush_pi(struct eufs_inode *pi) +{ + eufs_flush_cacheline(pi); + eufs_flush_cacheline(&pi->i_fresh); +} + +static __always_inline void inode_dep_lock(struct inode *inode) +{ + mutex_lock(&EUFS_I(inode)->i_dep_lock); +} +static __always_inline void inode_dep_unlock(struct inode *inode) +{ + mutex_unlock(&EUFS_I(inode)->i_dep_lock); +} +static __always_inline int inode_is_dep_locked(struct inode *inode) +{ + return mutex_is_locked(&EUFS_I(inode)->i_dep_lock); +} +static __always_inline void inode_header_lock(struct inode *inode) +{ + mutex_lock(&EUFS_I(inode)->i_header_lock); +} +static __always_inline void inode_header_unlock(struct inode *inode) +{ + mutex_unlock(&EUFS_I(inode)->i_header_lock); +} +static __always_inline int inode_is_header_locked(struct inode *inode) +{ + return mutex_is_locked(&EUFS_I(inode)->i_header_lock); +} +static __always_inline void inode_urgent_lock(struct inode *inode) +{ + mutex_lock(&EUFS_I(inode)->i_urgent_mutex); +} +static __always_inline void inode_urgent_unlock(struct inode *inode) +{ + mutex_unlock(&EUFS_I(inode)->i_urgent_mutex); +} +static __always_inline int inode_is_urgent_locked(struct inode *inode) +{ + return mutex_is_locked(&EUFS_I(inode)->i_urgent_mutex); +} + +static __always_inline void inode_leaf_lock(struct inode *inode) +{ + mutex_lock(&EUFS_I(inode)->i_leaf_lock); +} + +static __always_inline void inode_leaf_unlock(struct inode *inode) +{ + mutex_unlock(&EUFS_I(inode)->i_leaf_lock); +} + +#endif /* EUFS_COMMON_H */ diff --git a/fs/eulerfs/euler_dbg.h b/fs/eulerfs/euler_dbg.h new file mode 100644 index 000000000000..fbd3851cb5cd --- /dev/null +++ b/fs/eulerfs/euler_dbg.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_DBG_H +#define EUFS_DBG_H + +/* + * Debug code + */ + +#ifdef pr_fmt +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#endif + +#define eufs_dbg(s, args...) +#define eufs_dbg_vlimit(s, args...) +#define eufs_dbg_dir(s, args...) + +#define eufs_crit(s, args...) pr_crit(s, ##args) +#define eufs_err(sb, s, args...) eufs_error_mng(sb, s, ##args) +#define eufs_warn(s, args...) pr_warn(s, ##args) +#define eufs_info(s, args...) \ + pr_info("(pid=%d,cmd=%s) " s, current->pid, current->comm, ##args) + +#endif /* EUFS_DBG_H */ diff --git a/fs/eulerfs/euler_def.h b/fs/eulerfs/euler_def.h new file mode 100644 index 000000000000..727f1c4cf181 --- /dev/null +++ b/fs/eulerfs/euler_def.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. 
Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_DEF_H +#define EUFS_DEF_H + +#ifndef EUFS_H +#error "Do not include euler_def.h directly. Include euler.h instead." +#endif + +#include <linux/cpufeature.h> +#include <linux/processor.h> +#include <linux/types.h> +#include <linux/magic.h> +#include <linux/delay.h> + +struct alloc_batch { + /* both in slots */ + long size; + long n_used; + void **batch; + long n_pending; + struct list_head list; +}; + +struct v_dict; + +enum { I_TRANS_NONE = 0, I_TRANS_AVAIL, I_TRANS_LOCKED }; + +struct eufs_inode_info { + struct list_head i_dep_list; /* A list of struct op_node to persist */ + /* protect operations on i_dep_list */ + struct mutex i_dep_lock; + + struct llist_node i_persistee_node; + + u32 i_next_dep_seq; + u32 i_persisted_dep_seq; + spinlock_t i_owner_lock; + struct list_head i_owner_list; + + /* regular file: pmem pointer */ + void __pmem *i_volatile_root; + struct v_dict *i_volatile_dict; + /* + * serialize the insertion of dependency nodes into the same + * directory by different processes or CPUs + */ + struct mutex i_header_lock; + + struct mutex i_urgent_mutex; + + int i_volatile_height; + u64 i_volatile_tree_blocks; + + u64 i_dotdot; + /* + * a inode can only be added into a persistence list once, + * so use i_is_persisting & inode_lock to ensure that. 
+ */ + bool i_is_persisting; + /* whether or not the inode need persistence */ + bool i_is_dirty; + + int i_lock_transferred; + + bool hole_at_sta; /* the 0th data block is a hole */ + + u64 i_ext; + u16 i_version; + + struct alloc_batch page_batch; + /* serialize mmap with truncate/fallocate/write/unlink */ + struct rw_semaphore mmap_rwsem; + /* Protect pointers to leaf nodes (data pages) */ + struct mutex i_leaf_lock; + + spinlock_t i_dentry_persist_lock; + struct inode vfs_inode; +}; + +typedef u8 page_info_t; +struct page_wear; + +/* + * EulerFS super-block data in memory + */ +struct eufs_sb_info { + struct block_device *s_bdev; + struct dax_device *s_dax_dev; + phys_addr_t phys_addr; + void __pmem *virt_addr; + struct vm_struct *vm; + unsigned long block_start; + unsigned long block_end; + + void __pmem *renamej; + + u64 s_crash_ver; + + /* protects the SB's buffer-head */ + struct mutex s_lock; + + unsigned long blocksize; + unsigned long initsize; + unsigned long s_mount_opt; + atomic_t next_generation; + + /* Begin of Allocator */ + /* DRAM pools: + * - a single global pool + * - potected by page_lock and line_lock + * - a local pool per cpu + * - allocate/free from global pool in batch + * - no locks needed + * - a single (global) rest pool + * - when a page is used too many times, it is put into rest pool + * - cache lines are never put in rest pool + */ + spinlock_t large_lock; + spinlock_t page_lock; + spinlock_t line_lock; + struct mem_pool *gpool; + struct mem_pool *ppool; /* percpu variable */ + + spinlock_t rest_lock; + struct mem_pool *rest_pool; + + page_info_t __pmem *page_map; + void __pmem *data_start; + u64 npages; + + /* Other DRAM structures for the allcoator: + * + * - struct ptr_list_node: an unit for allocation (i.e., a page + * or a cacheline). + * + * - cached nodes: preallocated ptr_list_node for all pages, indexed by + * the page number. If the page is free, its ptr_list_node should + * be in some mem_pool. + * + * - line_node_ptrs: preallocated pointers for all pages. For each + * page, the pointer may point to an array of + * (PAGE_SIZE/CACHELINE_SIZE) ptr_list_nodes, each of which presents + * the allocation status of the corresponding cache line in the page. + * The array is dynamically allocated for memory conservation. + * + * - line_indicators: preallocated u8s for all pages. Each of the u8s + * records the number of cache lines available in global pool. This + * is used for cacheline coalescence. + * + * - page_wears: preallocated ints for all pages. Each of the ints + * records the number of writes to the page. This is used to + * coarse-grainedly show the degree of wear. + * + */ + struct ptr_list_node *cached_nodes; + struct ptr_list_node **line_node_ptrs; + u8 *line_indicators; /* Number of lines used per page! 
*/ + + struct page_wear *page_wears; + + /* End of Allocator */ + + /* Begin of Persister */ + /* kmem cache for dep_node is universal defined in super.c */ + struct llist_head *persistee_list; /* percpu variable */ + struct task_struct **persisters; + bool *need_sync; /* for fssync */ + wait_queue_head_t sync_wq; /* for fssync's thread */ + struct mutex sync_mutex; /* serialize fssync request */ + /* End of Persister */ + + /* The word `draining` is reserved for volatility quota limitation */ + bool s_draining; + wait_queue_head_t s_draining_wq; + + atomic_t s_nr_dirty_inodes; + atomic_t s_nr_dep_nodes; + + struct mutex gather_mutex; +}; + +struct dir_scan_data { + struct super_block *sb; + struct dir_context *ctx; +}; + +typedef u64 hashlen_t; + +#endif /* EUFS_DEF_H */ diff --git a/fs/eulerfs/nvm_struct.h b/fs/eulerfs/nvm_struct.h new file mode 100644 index 000000000000..4818ae6d49bf --- /dev/null +++ b/fs/eulerfs/nvm_struct.h @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_NVM_STRUCT_H +#define EUFS_NVM_STRUCT_H + +#define EUFS_SB_SIZE 512 +#define EUFS_SB2_OFFSET 512 +#define EUFS_SB_PADDING (1024 * 2) + +/* Used by rename journal */ +#define EUFS_MAX_CPU_CNT 128 +#define EUFS_RENAMEJ_ENTRY_SIZE (1024) +#define EUFS_RENAMEJ_SIZE (EUFS_MAX_CPU_CNT * EUFS_RENAMEJ_ENTRY_SIZE) +#define EUFS_RENAMEJ_OFFSET (EUFS_SB_SIZE * 2 + EUFS_SB_PADDING) + +#define EUFS_CRC_SEED (~0) +#define EUFS_RENAME_IN_ACTION 1 + +/* + * Layout + * +------------------------+ + * | Super Block | 64B + * +------------------------+ + * | Padding | 512B-64B + * +------------------------+ + * | Seconary Super Block | 64B + * +------------------------+ + * | Padding | Aligned to 4K + * +------------------------+ + * + * +------------------------+ + * | | + * | pages | + * | | + * +------------------------+ + * | bitmap for pages | 4K-aligned + * +------------------------+ + * | Rename-Journals | 128K (128 cores * 1024B/core) + * +------------------------+ + * | | + * | pages | + * | | + * +------------------------+ + */ +/* + * Structure of the EulerFS super block. + */ +struct eufs_super_block { + /* checksum of this sb */ + __le16 s_sum; + /* magic signature */ + __le16 s_magic; + char s_safe_umount; + char s_flag; + __le16 s_fs_version; + /* 8 Bytes */ + + /* total size of fs in bytes */ + __le64 s_size; + /* base virtual address used in fs */ + __le64 s_virt_addr; + /* 24 Bytes */ + + char s_volume_name[16]; + /* 40 Bytes */ + + /* points to the location of mini-journal and rename journal */ + __le64 s_page_map; + /* 48 Bytes */ + + /* + * s_mtime(mount time) and s_wtime(write time) should be together and + * their order should not be changed. we use an 8 byte write to update + * both of them atomically. 
+ */ + __le32 s_mtime; + __le32 s_wtime; + /* 56 Bytes */ + + __le64 s_root_pi; + /* 64 Bytes */ + __le64 s_crash_ver; +}; + +/* ========== directory & hash ========== */ +#define FIRST_LEN (CACHELINE_SIZE - sizeof(__le64) * 5) +#define FOLLOW_LEN (CACHELINE_SIZE - sizeof(__le64)) + +typedef u64 hashlen_t; +struct nv_dict { + __le64 __pmem table[NV_DICT_CAPACITY]; /* <struct nv_dict_entry *> */ +} __aligned(PAGE_SIZE); + +struct nv_dict_entry { + /* half a cache line (8B * 4) size in total */ + __le64 inode; /* <struct eufs_inode *> */ + __le64 next; /* <struct nv_dict_entry *> */ + __le64 volatile_next; /* <struct nv_dict_entry *> */ + /* store some filename */ + __le64 hv; /* <hashlen_t> hashlen */ + __le64 nextname; /* <char *> */ + char name[FIRST_LEN]; +} __aligned(CACHELINE_SIZE); + +struct nv_name_ext { + char name[FOLLOW_LEN]; + __le64 nextname; +} __aligned(CACHELINE_SIZE); + +#define EUFS_IS_HEAD_PI(pi) (!((u64)(pi) & (0x100 - 1))) + +#define EUFS_TWIN_PI(pi) \ + (EUFS_IS_HEAD_PI(pi) ? (((struct eufs_inode *)(pi)) + 1) : \ + (((struct eufs_inode *)(pi)) - 1)) + +#define EUFS_FRESH_PI(pi) \ + (((pi)->i_fresh >= EUFS_TWIN_PI(pi)->i_fresh) ? (pi) : \ + EUFS_TWIN_PI(pi)) + +#define EUFS_HEAD_PI(pi) (EUFS_IS_HEAD_PI(pi) ? (pi) : EUFS_TWIN_PI(pi)) + +/* ========== inode ========== */ +struct eufs_inode { + /* Cacheline 1: readmost part */ + /* 0 ~ 8 */ + __le32 i_flags; /* Inode flags */ + __le16 i_mode; /* File mode */ + __le16 i_version; /* Inode version */ + /* 8 ~ 16 */ + /* Note: the ctime to report is max(i_ctime, i_mtime) */ + __le64 i_ctime; /* Inode modification time (only for metadata) */ + /* 16 ~ 24 */ + __le32 i_uid; /* Owner Uid */ + __le32 i_gid; /* Group Id */ + /* 24 ~ 32 */ + __le64 i_dotdot; /* <struct eufs_inode *> parent inode (dir only) */ + /* 32 ~ 40 */ + __le64 i_ext; /* reserved for extension */ + /* 40 ~ 48 */ + __le32 i_ctime_nsec; /* nano sec */ + /* 48 ~ 56 */ + __le64 padding1; + /* 56 ~ 64 */ + __le64 padding2; + + /* Cacheline 2: readmost part */ + /* readwirte part */ + /* 0 ~ 8 */ + __le32 i_generation; /* File version (for NFS) */ + __le16 i_nlink; /* Links count */ + /* + * Freshness: we have twin-inodes here. When we access an inode, + * we compare the freshness of the two inodes and use the one with + * higher freshness. The freshness is only 16-bit, but we can easily + * handle the overflow. 
+ */ + __le16 i_fresh; /* Freshness of the inode */ + /* 8 ~ 16 */ + __le64 i_mtime; /* Inode b-tree Modification time */ + /* 16 ~ 24 */ + __le64 i_atime; /* Access time */ + /* 24 ~ 32 */ + union { + __le64 i_root; /* btree root (regular only) */ + __le64 i_dict; /* dict root (dir only */ + __le32 i_rdev; /* major/minor (device only) */ + }; + /* 32 ~ 40 */ + /* + * Size: + * for directory: number of entries inside + * for regular: number of bytes stored + * others: not used + */ + __le64 i_size; /* Size of data in bytes */ + /* 40 ~ 48 */ + __le64 i_tree_blocks; /* #blocks allocated in btree (regular only) */ + + /* 48 ~ 56 */ + __le32 i_mtime_nsec; /* nano sec */ + __le32 i_atime_nsec; /* nano sec */ + /* 56 ~ 64 */ + __le64 padding3; +} __aligned(CACHELINE_SIZE); + +#define eufs_iread_flags(i) (le32_to_cpu((i)->i_flags)) +#define eufs_iread_mode(i) (le16_to_cpu((i)->i_mode)) +#define eufs_iread_ctime(i) (le64_to_cpu((i)->i_ctime)) +#define eufs_iread_uid(i) (le32_to_cpu((i)->i_uid)) +#define eufs_iread_gid(i) (le32_to_cpu((i)->i_gid)) +#define eufs_iread_dotdot(i) (le64_to_cpu((i)->i_dotdot)) + +#define eufs_iwrite_flags(i, v) ((i)->i_flags = cpu_to_le32(v)) +#define eufs_iwrite_mode(i, v) ((i)->i_mode = cpu_to_le16(v)) +#define eufs_iwrite_ctime(i, v) ((i)->i_ctime = cpu_to_le64(v)) +#define eufs_iwrite_uid(i, v) ((i)->i_uid = cpu_to_le32(v)) +#define eufs_iwrite_gid(i, v) ((i)->i_gid = cpu_to_le32(v)) +#define eufs_iwrite_dotdot(i, v) ((i)->i_dotdot = cpu_to_le64(v)) + +#define eufs_iread_version(i) (le16_to_cpu((i)->i_version)) +#define eufs_iread_ctime_nsec(i) (le32_to_cpu((i)->i_ctime_nsec)) +#define eufs_iread_ext(i) (le64_to_cpu((i)->i_ext)) +#define eufs_iwrite_version(i, v) ((i)->i_version = cpu_to_le16(v)) +#define eufs_iwrite_ctime_nsec(i, v) ((i)->i_ctime_nsec = cpu_to_le32(v)) +#define eufs_iwrite_ext(i, v) ((i)->i_ext = cpu_to_le64(v)) + +#define eufs_writemostly_inode(i) ((i)) + +#define eufs_iread_generation(i) \ + (le32_to_cpu(eufs_writemostly_inode(i)->i_generation)) +#define eufs_iread_nlink(i) (le16_to_cpu(eufs_writemostly_inode(i)->i_nlink)) +#define eufs_iread_mtime(i) (le64_to_cpu(eufs_writemostly_inode(i)->i_mtime)) +#define eufs_iread_atime(i) (le64_to_cpu(eufs_writemostly_inode(i)->i_atime)) +#define eufs_iread_root(i) (le64_to_cpu(eufs_writemostly_inode(i)->i_root)) +#define eufs_iread_dict(i) (le64_to_cpu(eufs_writemostly_inode(i)->i_dict)) +#define eufs_iread_rdev(i) (le32_to_cpu(eufs_writemostly_inode(i)->i_rdev)) +#define eufs_iread_size(i) (le64_to_cpu(eufs_writemostly_inode(i)->i_size)) +#define eufs_iread_tree_blocks(i) \ + (le64_to_cpu(eufs_writemostly_inode(i)->i_tree_blocks)) + +#define eufs_iwrite_generation(i, v) \ + (eufs_writemostly_inode(i)->i_generation = cpu_to_le32(v)) +#define eufs_iwrite_nlink(i, v) \ + (eufs_writemostly_inode(i)->i_nlink = cpu_to_le16(v)) +#define eufs_iwrite_mtime(i, v) \ + (eufs_writemostly_inode(i)->i_mtime = cpu_to_le64(v)) +#define eufs_iwrite_atime(i, v) \ + (eufs_writemostly_inode(i)->i_atime = cpu_to_le64(v)) +#define eufs_iwrite_root(i, v) \ + (eufs_writemostly_inode(i)->i_root = cpu_to_le64(v)) +#define eufs_iwrite_dict(i, v) \ + (eufs_writemostly_inode(i)->i_dict = cpu_to_le64(v)) +#define eufs_iwrite_rdev(i, v) \ + (eufs_writemostly_inode(i)->i_rdev = cpu_to_le32(v)) +#define eufs_iwrite_size(i, v) \ + (eufs_writemostly_inode(i)->i_size = cpu_to_le64(v)) +#define eufs_iwrite_tree_blocks(i, v) \ + (eufs_writemostly_inode(i)->i_tree_blocks = cpu_to_le64(v)) + +#define eufs_iread_mtime_nsec(i) \ + 
(le32_to_cpu(eufs_writemostly_inode(i)->i_mtime_nsec)) +#define eufs_iread_atime_nsec(i) \ + (le32_to_cpu(eufs_writemostly_inode(i)->i_atime_nsec)) +#define eufs_iwrite_mtime_nsec(i, v) \ + (eufs_writemostly_inode(i)->i_mtime_nsec = cpu_to_le32(v)) +#define eufs_iwrite_atime_nsec(i, v) \ + (eufs_writemostly_inode(i)->i_atime_nsec = cpu_to_le32(v)) + +static inline void eufs_iwrite_ctime_mtime(struct eufs_inode *pi, + struct inode *vi) +{ + eufs_iwrite_ctime(pi, vi->i_ctime.tv_sec); + eufs_iwrite_ctime_nsec(pi, vi->i_ctime.tv_nsec); + + eufs_iwrite_mtime(pi, vi->i_mtime.tv_sec); + eufs_iwrite_mtime_nsec(pi, vi->i_mtime.tv_nsec); +} + +struct eufs_renamej { + __le32 crc; + __le32 flags; + __le64 addr_of_oldnext; + __le64 oldnext; + __le64 addr_of_newde; + __le64 composed_newde; /* composed as list header */ + __le64 newde_inode; + __le64 old_dir_pi; + __le64 new_dir_pi; + + __le64 time; + __le32 time_nsec; + __le16 old_link; + __le16 new_link; + __le32 old_size; + __le32 new_size; + __u8 pad[40]; +} __aligned(CACHELINE_SIZE); + +typedef u8 page_info_t; +typedef u8 line_info_t; + +struct embedded_line_info { + line_info_t gens[64]; +}; + +#endif /* EUFS_NVM_STRUCT_H */ diff --git a/fs/eulerfs/pbatch.h b/fs/eulerfs/pbatch.h new file mode 100644 index 000000000000..1a7bcf089213 --- /dev/null +++ b/fs/eulerfs/pbatch.h @@ -0,0 +1,314 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_PBATCH_H +#define EUFS_PBATCH_H + +/** + * To prevent data races, only two cases are allowed: + * 1) nvmalloc -> alloc_batch_persist -> nvfree + * 2) nvmalloc -> nvfree + */ + +/** + * eufs_alloc_batch_* API usage: + * + * struct alloc_batch batch; + * [ eufs_alloc_batch_init(&batch, estimated_size); ] + * eufs_alloc_batch_hint(&batch, estimated_size); + * eufs_alloc_batch_add(&batch, the_page_pointer); + * eufs_alloc_batch_add(&batch, the_page_pointer); + * ... + * eufs_alloc_batch_add(&batch, the_page_pointer); + * eufs_alloc_batch_persist_reset(&batch); + * + * eufs_alloc_batch_fini(&batch); + * + */ +/* TODO: consider using list? */ + +#define EUFS_AB_MAX_SIZE (KMALLOC_MAX_SIZE / 8) + +/* log2(cache_line size / page_info_t size) */ +#define EUFS_PMAP_CNT_SHIFT_PER_CACHELINE 6 + +static __always_inline void eufs_alloc_batch_hint(struct alloc_batch *pb, + ssize_t size); +static __always_inline void +eufs_alloc_batch_persist_reset(struct super_block *sb, struct alloc_batch *pb); +static __always_inline void eufs_alloc_batch_init(struct alloc_batch *pb, + ssize_t size) +{ + pb->n_used = 0; + pb->batch = NULL; + pb->size = 0; + pb->n_pending = 0; + eufs_alloc_batch_hint(pb, size); + BUG_ON(!pb->batch); +} + +/* This gives only hints, no guarantees. 
*/ +static __always_inline void eufs_alloc_batch_hint(struct alloc_batch *pb, + ssize_t size) +{ + ssize_t realsize; + void **batch; + + realsize = round_up(size * sizeof(void *), PAGE_SIZE); + if (realsize > KMALLOC_MAX_SIZE) + realsize = KMALLOC_MAX_SIZE; + size = realsize / sizeof(void *); + + if (pb->size >= size) + return; + batch = krealloc(pb->batch, realsize, GFP_KERNEL | __GFP_NOFAIL); + BUG_ON(batch == NULL); + pb->batch = batch; + pb->size = size; + eufs_dbg("! eufs_alloc_batch_hint ; ab=%px size=%ld\n", pb, size); +} + +static __always_inline void eufs_alloc_batch_hint_off(struct alloc_batch *pb, + ssize_t off_size) +{ + eufs_alloc_batch_hint(pb, pb->size + pb->n_pending + off_size); +} + +static __always_inline void eufs_alloc_batch_fini(struct alloc_batch *pb) +{ + kfree(pb->batch); + pb->batch = NULL; + pb->size = pb->n_used = 0; +} +/* Add an already allocated address */ +static __always_inline void eufs_alloc_batch_add(struct super_block *sb, + struct alloc_batch *pb, + void *page) +{ + if (pb->n_used == pb->size) { + /* Enlarge */ + if (pb->size == EUFS_AB_MAX_SIZE) + eufs_alloc_batch_persist_reset(sb, pb); + else + eufs_alloc_batch_hint(pb, pb->size * 2); + BUG_ON(pb->n_used >= pb->size); + } + BUG_ON(pb->n_used >= pb->size); + pb->batch[pb->n_used] = page; + pb->n_used++; +} + +/* + * With the following four functions, alloc_batch can be used as a pool of + * preallocation. + */ +static __always_inline int +eufs_alloc_batch_pre_allocate_begin(struct super_block *sb, + struct alloc_batch *ab, size_t need_blocks) +{ + long r; + BUG_ON(ab->n_pending); + eufs_alloc_batch_hint_off(ab, need_blocks); + ab->n_pending = need_blocks; + r = nvmalloc_pre(sb, ab, need_blocks, PAGE_SIZE); + if (r) + ab->n_pending = 0; + return r; +} +static __always_inline void +eufs_alloc_batch_pre_allocate_end(struct super_block *sb, + struct alloc_batch *ab) +{ + WARN((ab->n_pending != 0), + "Some pre-allocated pages are not used in %px!\n", ab); + BUG_ON(!list_empty(&ab->list)); +} + +/* Allocate from the pre-allocated addresses */ +static __always_inline void *eufs_alloc_batch_allocate(struct super_block *sb, + struct alloc_batch *ab, + u8 tag) +{ + void *page = NULL; + /* used up */ + BUG_ON(ab->n_pending <= 0); + page = nvmalloc_pre_get_from_list(sb, &ab->list, tag); + BUG_ON(!page); + ab->n_pending--; + eufs_alloc_batch_add(sb, ab, page); + return page; +} +static __always_inline void * +eufs_alloc_batch_allocate_file_index(struct super_block *sb, + struct alloc_batch *ab) +{ + return eufs_alloc_batch_allocate(sb, ab, EUFS_PAGE_FILE_INDEX); +} +static __always_inline void * +eufs_alloc_batch_allocate_file_data(struct super_block *sb, + struct alloc_batch *ab) +{ + return eufs_alloc_batch_allocate(sb, ab, EUFS_PAGE_FILE_DATA); +} + +static int cmp_func(const void *a, const void *b) +{ + const void **_a = (const void **)a; + const void **_b = (const void **)b; + + if (*_a > *_b) + return 1; + if (*_a < *_b) + return -1; + return 0; +} +#define _PAGE_NO(ptr) (((u64)ptr - (u64)sbi->data_start) / PAGE_SIZE) +#define _LINE_MAP(addr) ((line_info_t *)((u64)(addr)&PAGE_MASK)) +#define _IS_LINE(addr) ((u64)addr % PAGE_SIZE) +static __always_inline void _set_bitmap(struct eufs_sb_info *sbi, u64 addr, + bool forced) +{ + u64 page_no = _PAGE_NO(addr); + u64 rem = addr % PAGE_SIZE; + line_info_t __pmem *line_map; + /* no one can free this address now, so no race will happen */ + struct ptr_list_node *node; + int line_no; + + if (rem == 0) { + /* page */ + node = sbi->cached_nodes + (page_no); + if 
(!forced) { + BUG_ON(node->solid); + BUG_ON(sbi->page_map[page_no] != EUFS_PAGE_FREE); + } + WARN(node->tag == 0, + "unexpected page node tag %u (addr 0x%llx)\n", node->tag, + addr); + sbi->page_map[page_no] = node->tag; + node->solid = true; + } else { + /* line */ + BUG_ON(rem % CACHELINE_SIZE != 0); + + line_map = (void *)(addr - rem); + line_no = rem / CACHELINE_SIZE; + + BUG_ON(sbi->page_map[page_no] != EUFS_PAGE_FREE && + sbi->page_map[page_no] != EUFS_PAGE_LINE_USED); + /* \ _set _unset + * _set idempotent + * _unset + */ + if (sbi->page_map[page_no] == EUFS_PAGE_FREE) { + /* idempotent */ + sbi->page_map[page_no] = EUFS_PAGE_LINE_USED; + node = sbi->cached_nodes + (page_no); + BUG_ON(!node->busy); + node->solid = true; + } + + node = &sbi->line_node_ptrs[page_no][line_no]; + if (!forced) { + BUG_ON(node->solid); + if (line_map[line_no]) { + eufs_info( + "!line_map[line_no] = %px[%d] = %d\n", + line_map, line_no, line_map[line_no]); + BUG(); + } + BUG_ON(line_map[line_no]); + } + WARN(node->tag == 0, + "unexpected line node tag %u (addr 0x%llx)\n", node->tag, + addr); + line_map[line_no] = node->tag; + eufs_dbg("set %px[%d] = %d forced=%d\n", line_map, line_no, + line_map[line_no], forced); + node->solid = true; + BUG_ON(!node->busy); + } +} + +static __always_inline void +eufs_alloc_batch_persist_reset(struct super_block *sb, struct alloc_batch *pb) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + u64 page_no, page_no0; + int i; + + if (pb->n_used == 0) + goto reset; + if (pb->size == 0) + goto reset; + + BUG_ON(!pb->batch); + + sort(pb->batch, pb->n_used, sizeof(void *), cmp_func, NULL); + + for (i = 0; i < pb->n_used; ++i) { + if (i > 0 && pb->batch[i] == pb->batch[i - 1]) { + pr_info("!pb->batch[i]=%px [i-1]=%px i=%d\n", + pb->batch[i], pb->batch[i - 1], i); + BUG(); + } + _set_bitmap(sbi, (u64)pb->batch[i], false); + } + + page_no0 = _PAGE_NO(pb->batch[0]); + if (_IS_LINE(pb->batch[0])) + eufs_flush_cacheline(_LINE_MAP(pb->batch[0])); + eufs_flush_cacheline(&sbi->page_map[page_no0]); + + for (i = 1; i < pb->n_used; ++i) { + page_no = _PAGE_NO(pb->batch[i]); + if (page_no == page_no0) + /* same page, must be allocation of two cache lines */ + continue; + + /* different page */ + if (_IS_LINE(pb->batch[i])) + eufs_flush_cacheline(_LINE_MAP(pb->batch[i])); + + /* not in a single cache line */ + if ((page_no >> EUFS_PMAP_CNT_SHIFT_PER_CACHELINE) != + (page_no0 >> EUFS_PMAP_CNT_SHIFT_PER_CACHELINE)) + eufs_flush_cacheline(&sbi->page_map[page_no]); + page_no0 = page_no; + } + + eufs_dbg("!persistallocation: pb=%px sorted %px~%px %ld\n", pb, + pb->batch[0], pb->batch[pb->n_used - 1], pb->n_used); +reset: + pb->n_used = 0; +} + +static __always_inline void eufs_alloc_persist(struct super_block *sb, + void *ptr, bool forced) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + u64 page_no = _PAGE_NO(ptr); + + _set_bitmap(sbi, (u64)ptr, forced); + + if (_IS_LINE(ptr)) + eufs_flush_cacheline(_LINE_MAP(ptr)); + + eufs_flush_cacheline(&sbi->page_map[page_no]); +} + +#undef _PAGE_NO +#undef _LINE_MAP +#undef _IS_LINE + +#endif /* EUFS_PBATCH_H */
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR CVE: NA
--------------------------------------
Three kmem_caches in total:
- dep_node
- page
- inode
Add interfaces to initialize and destroy each kmem_cache, and to allocate and free objects from them.
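For illustration, a hypothetical caller of the new helpers (only eufs_alloc_dep_node() and eufs_free_dep_node() come from this patch):

/* Hypothetical sketch of a dep_node's lifetime using the slab cache helpers. */
static int demo_dep_node_lifetime(void)
{
        struct dep_node *dep = eufs_alloc_dep_node();

        if (!dep)
                return -ENOMEM;
        /* ... fill in the dependency and queue it for persistence ... */
        eufs_free_dep_node(dep);
        return 0;
}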
Signed-off-by: Mingkai Dong dongmingkai1@huawei.com Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/eulerfs/kmem_cache.c | 107 ++++++++++++++++++++++++++++++++++++++++ fs/eulerfs/kmem_cache.h | 37 ++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 fs/eulerfs/kmem_cache.c create mode 100644 fs/eulerfs/kmem_cache.h
diff --git a/fs/eulerfs/kmem_cache.c b/fs/eulerfs/kmem_cache.c new file mode 100644 index 000000000000..8b8299edf8ad --- /dev/null +++ b/fs/eulerfs/kmem_cache.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/fs.h> +#include "euler.h" +#include "kmem_cache.h" +#include "dep.h" + +static struct kmem_cache *eufs_dep_node_cachep; +static struct kmem_cache *eufs_page_cachep; +static struct kmem_cache *eufs_inode_cachep; + +static void init_once(void *foo) +{ + struct eufs_inode_info *vi = foo; + + inode_init_once(&vi->vfs_inode); +} + +int __init init_page_cache(void) +{ + eufs_page_cachep = kmem_cache_create( + "eufs_page_cache", PAGE_SIZE, 0, + (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_HWCACHE_ALIGN), + NULL); + if (eufs_page_cachep == NULL) + return -ENOMEM; + return 0; +} + +int __init init_dep_node_cache(void) +{ + eufs_dep_node_cachep = kmem_cache_create( + "eufs_dep_node_cache", sizeof(struct dep_node), 0, + (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_HWCACHE_ALIGN), + NULL); + if (eufs_dep_node_cachep == NULL) + return -ENOMEM; + return 0; +} + +int __init init_inodecache(void) +{ + eufs_inode_cachep = kmem_cache_create( + "eufs_inode_cache", sizeof(struct eufs_inode_info), 0, + (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD), init_once); + if (eufs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +void destroy_page_cache(void) +{ + kmem_cache_destroy(eufs_page_cachep); +} + +void destroy_inodecache(void) +{ + rcu_barrier(); + kmem_cache_destroy(eufs_inode_cachep); +} + +void destroy_dep_node_cache(void) +{ + kmem_cache_destroy(eufs_dep_node_cachep); +} + +void *eufs_zalloc_page(void) +{ + return kmem_cache_zalloc(eufs_page_cachep, GFP_NOFS); +} +void *eufs_alloc_page(void) +{ + return kmem_cache_alloc(eufs_page_cachep, GFP_NOFS); +} +void eufs_free_page(void *page) +{ + kmem_cache_free(eufs_page_cachep, page); +} + +struct dep_node *eufs_alloc_dep_node(void) +{ + return kmem_cache_alloc(eufs_dep_node_cachep, GFP_NOFS); +} +void eufs_free_dep_node(struct dep_node *dep) +{ + kmem_cache_free(eufs_dep_node_cachep, dep); +} + +struct eufs_inode_info *eufs_alloc_vi(void) +{ + return kmem_cache_alloc(eufs_inode_cachep, GFP_NOFS); +} +void eufs_free_vi(struct eufs_inode_info *vi) +{ + kmem_cache_free(eufs_inode_cachep, vi); +} diff --git a/fs/eulerfs/kmem_cache.h b/fs/eulerfs/kmem_cache.h new file mode 100644 index 000000000000..94718b4bc531 --- /dev/null +++ b/fs/eulerfs/kmem_cache.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_KMEM_CACHE_H +#define EUFS_KMEM_CACHE_H + +#include <linux/module.h> + +extern int init_page_cache(void) __init; +extern int init_dep_node_cache(void) __init; +extern int init_inodecache(void) __init; + +extern void destroy_page_cache(void); +extern void destroy_inodecache(void); +extern void destroy_dep_node_cache(void); + +extern void *eufs_zalloc_page(void); +extern void *eufs_alloc_page(void); +extern void eufs_free_page(void *page); + +extern struct dep_node *eufs_alloc_dep_node(void); +extern void eufs_free_dep_node(struct dep_node *dep); + +extern struct eufs_inode_info *eufs_alloc_vi(void); +extern void eufs_free_vi(struct eufs_inode_info *vi); + +#endif /* EUFS_KMEM_CACHE_H */
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR CVE: NA
--------------------------------------
Implement the NVM allocator: nvmalloc()/nvfree(), the global and per-CPU memory pools, and the wrappers in alloc_interface.h that allocate pages and cache lines from persistent memory.
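For illustration, a hypothetical caller of the wrappers added in alloc_interface.h (only eufs_zalloc_htable() and nv_free() come from this patch; callers may additionally persist the allocation via the alloc_batch API in pbatch.h):

/* Hypothetical sketch: allocate a zeroed, tagged 4K page from pmem and
 * release it again. */
static int demo_nvm_alloc(struct super_block *sb)
{
        void *htable = eufs_zalloc_htable(sb); /* nvzalloc(sb, PAGE_SIZE, EUFS_PAGE_HTABLE, false) */

        if (!htable)
                return -ENOSPC;
        /* ... populate the hash table ... */
        nv_free(sb, htable);
        return 0;
}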
Signed-off-by: Mingkai Dong dongmingkai1@huawei.com Signed-off-by: Hou Tao houtao1@huawei.com Signed-off-by: sunqiuyang sunqiuyang@huawei.com Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/eulerfs/alloc_interface.h | 113 +++ fs/eulerfs/nvalloc.c | 1451 ++++++++++++++++++++++++++++++++++ fs/eulerfs/nvalloc.h | 214 +++++ 3 files changed, 1778 insertions(+) create mode 100644 fs/eulerfs/alloc_interface.h create mode 100644 fs/eulerfs/nvalloc.c create mode 100644 fs/eulerfs/nvalloc.h
diff --git a/fs/eulerfs/alloc_interface.h b/fs/eulerfs/alloc_interface.h new file mode 100644 index 000000000000..22d30c7672e0 --- /dev/null +++ b/fs/eulerfs/alloc_interface.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_ALLOC_INTERFACE_H +#define EUFS_ALLOC_INTERFACE_H + +#include "nvalloc.h" +#include "pbatch.h" + +static __always_inline void *nvzalloc(struct super_block *sb, size_t size, + u8 tag, bool nonblocking) +{ + void *r = nvmalloc(sb, size, tag, nonblocking); + + if (r) + memset(r, 0, size); + + return r; +} + +static __always_inline void * +nv_zalloc_file_data_nonblocking(struct super_block *sb) +{ + return nvzalloc(sb, PAGE_SIZE, EUFS_PAGE_FILE_DATA, true); +} + +struct eufs_inode; +struct nv_name_ext; +struct nv_dict_entry; + +static __always_inline struct eufs_inode * +eufs_malloc_pinode(struct super_block *sb) +{ + /* mirrored inodes: the head inode and the tail inode */ + return nvmalloc(sb, EUFS_INODE_SIZE * 2, EUFS_LINE4_INODE, false); +} +static __always_inline struct nv_dict_entry * +eufs_malloc_dentry(struct super_block *sb) +{ + return nvmalloc(sb, CACHELINE_SIZE, EUFS_LINE_DENTRY, false); +} +static __always_inline struct nv_name_ext * +eufs_malloc_name_ext(struct super_block *sb) +{ + return nvmalloc(sb, CACHELINE_SIZE, EUFS_LINE_NAME_EXT, false); +} + +static __always_inline void *eufs_malloc_file_data(struct super_block *sb) +{ + return nvmalloc(sb, PAGE_SIZE, EUFS_PAGE_FILE_DATA, false); +} +static __always_inline void *eufs_zalloc_file_data(struct super_block *sb) +{ + return nvzalloc(sb, PAGE_SIZE, EUFS_PAGE_FILE_DATA, false); +} +static __always_inline void *eufs_zmlloc_file_index(struct super_block *sb) +{ + return nvmalloc(sb, PAGE_SIZE, EUFS_PAGE_FILE_INDEX, false); +} +static __always_inline void *eufs_zalloc_symlink(struct super_block *sb) +{ + return nvzalloc(sb, PAGE_SIZE, EUFS_PAGE_SYMLINK, false); +} +static __always_inline void *eufs_zalloc_htable(struct super_block *sb) +{ + return nvzalloc(sb, PAGE_SIZE, EUFS_PAGE_HTABLE, false); +} +static __always_inline void *eufs_malloc_inode_ext(struct super_block *sb) +{ + return nvmalloc(sb, PAGE_SIZE, EUFS_PAGE_INODE_EXT, false); +} + +static __always_inline void nv_zfree(struct super_block *sb, void *p) +{ + if (p == NULL_ADDR_PTR) + return; + + nvfree(sb, p, false); +} + +static __always_inline void nv_free(struct super_block *sb, void *p) +{ + if (p != NULL_ADDR_PTR) + nv_zfree(sb, p); +} + +static __always_inline void nv_free_rest(struct super_block *sb, void *p) +{ + if (p != NULL_ADDR_PTR) + nvfree(sb, p, true); +} + +static __always_inline void *zalloc(ssize_t size) +{ + return kzalloc(size, GFP_KERNEL); +} + +static __always_inline void zfree(void *p) +{ + kfree(p); +} + +#endif /* EUFS_ALLOC_INTERFACE_H */ diff --git a/fs/eulerfs/nvalloc.c b/fs/eulerfs/nvalloc.c new file mode 100644 index 000000000000..8b60a2494636 --- /dev/null +++ b/fs/eulerfs/nvalloc.c @@ -0,0 +1,1451 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 
2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/kernel.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/percpu.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/fs.h> +#include <linux/vmalloc.h> +#include <linux/ratelimit.h> +#include "nvalloc.h" +#include "euler.h" + +static __always_inline void print_ptr_list_node(struct ptr_list_node *node) +{ + eufs_info("========> &ptr_list_node = %px <==========\n", node); + eufs_info("= node => .prev=%px .next=%px\n", node->node.prev, + node->node.next); + eufs_info("= ptr =%px\n", node->ptr); + eufs_info("======== reported @cpu=%d =============\n", + smp_processor_id()); +} + +static __always_inline void memclr(void *ptr, size_t len) +{ + memset(ptr, 0, len); +} + +static __always_inline void *eufs_get_page(struct super_block *sb, int page_no) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + + return sbi->data_start + page_no * PAGE_SIZE; +} + +void eufs_get_layout(struct super_block *sb, bool init) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + + unsigned long start_addr = (u64)sbi->virt_addr; + ssize_t len = sbi->initsize; + + unsigned long ptr; + ssize_t page_map_size; + + /* only support 4K page now */ + BUG_ON(PAGE_SIZE != 4096); + BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *)); + + /* align the start to 4K */ + ptr = round_up(start_addr, PAGE_SIZE); + len -= (ptr - start_addr); + + sbi->npages = len / PAGE_SIZE; /* round down */ + sbi->data_start = (void *)((uintptr_t) ptr); + + /* skip the first 4K, reserved for super blocks */ + ptr += PAGE_SIZE; + len -= PAGE_SIZE; + + /* get page-map */ + if (init) + sbi->page_map = (page_info_t *)ptr; + page_map_size = round_up(sbi->npages * sizeof(page_info_t), PAGE_SIZE); + + ptr += page_map_size; + len -= page_map_size; + + /* skip for renamej */ + sbi->renamej = (void *)ptr; + ptr += EUFS_RENAMEJ_SIZE; + len -= EUFS_RENAMEJ_SIZE; + if (init) { + /* clear the pagemap */ + memclr(sbi->page_map, page_map_size); + memclr(sbi->renamej, EUFS_RENAMEJ_SIZE); + eufs_flush_buffer(sbi->renamej, EUFS_RENAMEJ_SIZE, true); + } +} + +static void partition_page(struct eufs_sb_info *sbi, int page_no, + line_info_t *gens, int *line4_cpu, + int *line4_countdown) +{ + struct ptr_list_node *node; + int i = page_no; + int j; + + /* no cache line is in global pool */ + sbi->line_indicators[i] = 0; + for (j = 1; j < 64; ++j) { + node = &sbi->line_node_ptrs[i][j]; + node->ptr = ((void *)gens) + CACHELINE_SIZE * j; + if (gens[j] == EUFS_LINE_DENTRY || + gens[j] == EUFS_LINE_NAME_EXT) { + /* line used */ + node->busy = true; + node->solid = true; + node->multiple = false; + node->tag = gens[j]; + continue; + } + if (gens[j] == EUFS_LINE4_INODE) { + int k; + /* linex4 used */ + node->busy = true; + node->solid = true; + node->multiple = true; + node->tag = gens[j]; + for (k = 1; k < 4; ++k) { + sbi->line_node_ptrs[i][j + k].ptr = + ((void *)gens) + + CACHELINE_SIZE * (j + k); + sbi->line_node_ptrs[i][j + 
k].busy = false; + sbi->line_node_ptrs[i][j + k].solid = false; + sbi->line_node_ptrs[i][j + k].multiple = false; + } + j += 3; + continue; + } + /* EUFS_LINE_FREE */ + if ((j & 3) == 0 && + /* probe */ + (gens[j + 1] == EUFS_LINE_FREE && + gens[j + 2] == EUFS_LINE_FREE && + gens[j + 3] == EUFS_LINE_FREE)) { + struct mem_pool *line4_ppool; + int k; + + node->busy = false; + node->solid = false; + node->multiple = true; + for (k = 1; k < 4; ++k) { + sbi->line_node_ptrs[i][j + k].ptr = + ((void *)gens) + + CACHELINE_SIZE * (j + k); + sbi->line_node_ptrs[i][j + k].busy = false; + sbi->line_node_ptrs[i][j + k].solid = false; + sbi->line_node_ptrs[i][j + k].multiple = false; + } + if (*line4_countdown == 0) { + /* switch to next cpu */ + *line4_cpu = cpumask_next(*line4_cpu, + cpu_possible_mask); + if (*line4_cpu >= nr_cpu_ids) + *line4_cpu = cpumask_next( + -1, cpu_possible_mask); + *line4_countdown = EUFS_PRE_PAGES_PERCPU; + } + line4_ppool = per_cpu_ptr(sbi->ppool, *line4_cpu); + list_add(&node->node, &line4_ppool->line4_list); + line4_ppool->nline4s++; + (*line4_countdown)--; + j += 3; + continue; + } + node->busy = false; + node->solid = false; + node->multiple = false; + ++sbi->line_indicators[i]; + list_add(&node->node, &sbi->gpool->line_list); + sbi->gpool->nlines++; + } +} + +static bool probe_large_page(struct eufs_sb_info *sbi, long page_no) +{ + long i = page_no; + int k; + + for (k = 1; k < 512; ++k) { + if (sbi->page_map[i + k] != EUFS_PAGE_FREE) + return false; + } + return true; +} + +/* Partition the area into multiple zones */ +static void partition(struct super_block *sb, bool init) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + u64 start_addr = (u64)sbi->virt_addr; + u64 len = sbi->initsize; + u64 npages_percpu; + u64 cpu_page_left; + u64 start_page; + int cpu; + int i; + int k; + struct mem_pool *pool; + struct ptr_list_node *node; + ssize_t page_map_size; + int line4_cpu; + int line4_countdown; + + /* + * The status of 64 cache-lines in a pmem page are tracked by + * 64 ptr_list_node in volatile page, so check whether or not + * the size of ptr_list_node is too large. 
+ */ + BUILD_BUG_ON(64 * sizeof(struct ptr_list_node) > PAGE_SIZE); + + eufs_get_layout(sb, init); + page_map_size = round_up(sbi->npages * sizeof(page_info_t), PAGE_SIZE); + + /* allocate space for volatile allocator */ + sbi->cached_nodes = vmalloc(sizeof(struct ptr_list_node) * sbi->npages); + memclr(sbi->cached_nodes, sizeof(struct ptr_list_node) * sbi->npages); + + /* pointers reserved for cache line nodes for a page (64 lines) */ + sbi->line_node_ptrs = + vmalloc(sizeof(struct ptr_list_node *) * sbi->npages); + memclr(sbi->line_node_ptrs, + sizeof(struct ptr_list_node *) * sbi->npages); + + sbi->line_indicators = + vmalloc(sizeof(*sbi->line_indicators) * sbi->npages); + memclr(sbi->line_indicators, + sizeof(*sbi->line_indicators) * sbi->npages); + + i = 0; + if (init) { + unsigned int reserved_pages; + + eufs_info("start: %llx, len=%llu\n", start_addr, len); + + /* +1 for super block */ + reserved_pages = + 1 + page_map_size / PAGE_SIZE + + round_up(EUFS_RENAMEJ_SIZE, PAGE_SIZE) / PAGE_SIZE; + while (reserved_pages-- > 0) + sbi->page_map[i++] = EUFS_PAGE_RESERVED; + + eufs_flush_buffer(sbi->page_map, page_map_size, true); + } + + npages_percpu = EUFS_PRE_PAGES_PERCPU; + + cpu = -1; + cpu_page_left = 0; + start_page = 0; + + /* init spinlock for gpool */ + spin_lock_init(&sbi->large_lock); + spin_lock_init(&sbi->page_lock); + spin_lock_init(&sbi->line_lock); + spin_lock_init(&sbi->rest_lock); + + sbi->gpool->nlarges = 0; + sbi->gpool->npages = 0; + sbi->gpool->nlines = 0; + + line4_cpu = cpumask_next(-1, cpu_possible_mask); + line4_countdown = npages_percpu; + + for (; i < sbi->npages; ++i) { + if (cpu_page_left == 0) { + eufs_info( + "%s for cpu=%d, page=[%llu~%llu) [%px~%px)\n", + __func__, cpu, (u64)start_page, (u64)i, + eufs_get_page(sb, start_page), + eufs_get_page(sb, i)); + if (cpu < (int)nr_cpu_ids) + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) { + pool = sbi->gpool; + cpu_page_left = + sbi->npages; /* never exhausted */ + } else { + pool = per_cpu_ptr(sbi->ppool, cpu); + cpu_page_left = npages_percpu; + } + start_page = i; + } + node = sbi->cached_nodes + (i); + node->ptr = eufs_get_page(sb, i); + switch (sbi->page_map[i]) { + case EUFS_PAGE_LARGE_USED: + node->busy = true; + node->solid = true; + node->multiple = true; + node->tag = sbi->page_map[i]; + WARN(((u64)node->ptr) & ((2 << 20) - 1), + "EulerFS unalinged large page!"); + for (k = 1; k < 512; ++k) { + sbi->cached_nodes[i + k].ptr = + eufs_get_page(sb, i + k); + sbi->cached_nodes[i + k].busy = false; + sbi->cached_nodes[i + k].solid = false; + sbi->cached_nodes[i + k].multiple = false; + } + i += 511; + continue; + /* case EUFS_PAGE_USED: */ + case EUFS_PAGE_RESERVED: + case EUFS_PAGE_FILE_DATA: + case EUFS_PAGE_FILE_INDEX: + case EUFS_PAGE_HTABLE: + case EUFS_PAGE_SYMLINK: + case EUFS_PAGE_INODE_EXT: + BUG_ON(init); + node->busy = true; + node->solid = true; + node->multiple = false; + node->tag = sbi->page_map[i]; + /* page used */ + continue; + case EUFS_PAGE_LINE_USED: + BUG_ON(init); + /* page used as cache lines */ + node->busy = true; + node->solid = true; + node->multiple = false; + node->tag = sbi->page_map[i]; + + /* TODO: add cache lines */ + BUG_ON(sbi->line_node_ptrs[i]); + sbi->line_node_ptrs[i] = eufs_zalloc_page(); + + partition_page(sbi, i, node->ptr, &line4_cpu, + &line4_countdown); + + break; + case EUFS_PAGE_FREE: + /* allocate and fill the node */ + node->busy = false; + node->solid = false; + + if ((((u64)node->ptr) & ((2 << 20) - 1)) == 0 && + probe_large_page(sbi, i)) { 
+ /* insert as large page */ + node->multiple = true; + + list_add(&node->node, &pool->large_list); + pool->nlarges++; + + cpu_page_left--; + + for (k = 1; k < 512; ++k) { + sbi->cached_nodes[i + k].ptr = + eufs_get_page(sb, i + k); + sbi->cached_nodes[i + k].busy = false; + sbi->cached_nodes[i + k].solid = false; + sbi->cached_nodes[i + k].multiple = + false; + } + i += 511; + } else { + /* insert to ppool */ + node->multiple = false; + list_add(&node->node, &pool->page_list); + pool->npages++; + + cpu_page_left--; + } + break; + default: + eufs_warn( + "Invalid value 0x%x in pagemap[%d] is detected!\n", + sbi->page_map[i], i); + continue; + } + } + if (cpu < nr_cpu_ids) + eufs_info("%s for cpu=%d, page=[%llu~%llu) [%px~%px)\n", + __func__, cpu, (u64)start_page, (u64)i, + eufs_get_page(sb, start_page), eufs_get_page(sb, i)); + else + eufs_info("%s for global pool, page=[%llu~%llu)\n", + __func__, start_page, (u64)i); +} + +static void return_page(struct eufs_sb_info *sbi, struct mem_pool *ppool, + struct ptr_list_node *node, bool rest) +{ + unsigned long flags; + u64 page_num = (node->ptr - sbi->data_start) / PAGE_SIZE; + + sbi->page_map[page_num] = EUFS_PAGE_FREE; + eufs_flush_cacheline(&sbi->page_map[page_num]); + eufs_pbarrier(); + if (wear_control && + (node->counter++ % wear_alloc_threshold == 0 || rest)) { + spin_lock_irqsave(&sbi->rest_lock, flags); + list_add(&node->node, &sbi->rest_pool->page_list); + sbi->rest_pool->npages++; + spin_unlock_irqrestore(&sbi->rest_lock, flags); + } else if (ppool->npages >= LOCAL_PAGE_MAX) { + spin_lock_irqsave(&sbi->page_lock, flags); + list_add(&node->node, &sbi->gpool->page_list); + sbi->gpool->npages++; + spin_unlock_irqrestore(&sbi->page_lock, flags); + } else { + local_irq_save(flags); + + list_add(&node->node, &ppool->page_list); + ppool->npages++; + + local_irq_restore(flags); + } +} + +static void _unset_bitmap(struct eufs_sb_info *sbi, u64 addr, bool flush); +static void return_cl(struct eufs_sb_info *sbi, struct mem_pool *ppool, + struct ptr_list_node *node, bool rest) +{ + unsigned long flags, flags2; + u64 page_no; + u64 page_off; + int i; + struct ptr_list_node *tmp; + + if (wear_control && + (node->counter++ % wear_alloc_threshold == 0 || rest)) { + spin_lock_irqsave(&sbi->rest_lock, flags); + list_add(&node->node, &sbi->rest_pool->line_list); + sbi->rest_pool->nlines++; + spin_unlock_irqrestore(&sbi->rest_lock, flags); + } else if (ppool->nlines >= LOCAL_LINE_MAX) { + page_off = (node->ptr - sbi->data_start); + page_no = page_off / PAGE_SIZE; + page_off = page_off % PAGE_SIZE; + + spin_lock_irqsave(&sbi->line_lock, flags2); + /* line_indicators are protected by sbi->line_lock */ + if (++sbi->line_indicators[page_no] == 63) { + /* Remove all cache lines */ + for (i = 1; i < 64; ++i) { + tmp = &sbi->line_node_ptrs[page_no][i]; + if (tmp == node) + continue; + list_del(&tmp->node); + /* It must be !solid since we ensure it during nvfree */ + BUG_ON(tmp->solid); + --sbi->gpool->nlines; + } + spin_unlock_irqrestore(&sbi->line_lock, flags2); + eufs_dbg("! 
cacheline coalescence !\n"); + + /* Add back a whole page */ + tmp = &sbi->cached_nodes[page_no]; + BUG_ON(!tmp->solid); + _unset_bitmap(sbi, (u64)tmp->ptr, true); + _SET_NON_BUSY(tmp, "fault addr %px", tmp->ptr); + + spin_lock_irqsave(&sbi->page_lock, flags); + list_add(&tmp->node, &sbi->gpool->page_list); + sbi->gpool->npages++; + sbi->page_map[page_no] = EUFS_PAGE_FREE; + + spin_unlock_irqrestore(&sbi->page_lock, flags); + + return; + } + + list_add(&node->node, &sbi->gpool->line_list); + sbi->gpool->nlines++; + spin_unlock_irqrestore(&sbi->line_lock, flags2); + + } else { + list_add(&node->node, &ppool->line_list); + ppool->nlines++; + } +} + +static void return_line4(struct eufs_sb_info *sbi, struct mem_pool *ppool, + struct ptr_list_node *node, bool rest) +{ + if (wear_control && + (node->counter++ % wear_alloc_threshold == 0 || rest)) { + node->multiple = false; + return_cl(sbi, ppool, node++, rest); + node->multiple = false; + return_cl(sbi, ppool, node++, rest); + node->multiple = false; + return_cl(sbi, ppool, node++, rest); + node->multiple = false; + return_cl(sbi, ppool, node++, rest); + + } else if (ppool->nlines >= LOCAL_LINE_MAX) { + node->multiple = false; + return_cl(sbi, ppool, node++, rest); + node->multiple = false; + return_cl(sbi, ppool, node++, rest); + node->multiple = false; + return_cl(sbi, ppool, node++, rest); + node->multiple = false; + return_cl(sbi, ppool, node++, rest); + + } else { + list_add(&node->node, &ppool->line4_list); + ppool->nline4s++; + } +} + +void nv_fini(struct super_block *sb) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + int i; + + vfree(sbi->cached_nodes); + for (i = 0; i < sbi->npages; ++i) + if (sbi->line_node_ptrs[i]) + eufs_free_page(sbi->line_node_ptrs[i]); + vfree(sbi->line_node_ptrs); + vfree(sbi->line_indicators); + + free_percpu(sbi->ppool); + kfree(sbi->rest_pool); + kfree(sbi->gpool); +} + +void nv_init(struct super_block *sb, bool init) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + struct mem_pool *ppool; + int cpu; + + /* allocate pools */ + sbi->gpool = kmalloc(sizeof(struct mem_pool), GFP_KERNEL); + INIT_LIST_HEAD(&sbi->gpool->large_list); + INIT_LIST_HEAD(&sbi->gpool->page_list); + INIT_LIST_HEAD(&sbi->gpool->line4_list); + INIT_LIST_HEAD(&sbi->gpool->line_list); + sbi->gpool->nlarges = 0; + sbi->gpool->npages = 0; + sbi->gpool->nline4s = 0; + sbi->gpool->nlines = 0; + + sbi->rest_pool = kmalloc(sizeof(struct mem_pool), GFP_KERNEL); + INIT_LIST_HEAD(&sbi->rest_pool->large_list); + INIT_LIST_HEAD(&sbi->rest_pool->page_list); + INIT_LIST_HEAD(&sbi->rest_pool->line4_list); + INIT_LIST_HEAD(&sbi->rest_pool->line_list); + + sbi->rest_pool->nlarges = 0; + sbi->rest_pool->npages = 0; + sbi->rest_pool->nline4s = 0; + sbi->rest_pool->nlines = 0; + + sbi->ppool = alloc_percpu(struct mem_pool); + for_each_online_cpu(cpu) { + ppool = per_cpu_ptr(sbi->ppool, cpu); + INIT_LIST_HEAD(&ppool->large_list); + INIT_LIST_HEAD(&ppool->page_list); + INIT_LIST_HEAD(&ppool->line4_list); + INIT_LIST_HEAD(&ppool->line_list); + ppool->nlarges = 0; + ppool->npages = 0; + ppool->nline4s = 0; + ppool->nlines = 0; + ppool->fetch_count = FETCH_COUNT; + } + + partition(sb, init); +} + +static int cut_from_list_remaining(struct list_head *head, int remaining, + struct list_head *tmp) +{ + int i = 0; + struct list_head *end; + struct list_head *sentry; + + if (list_empty(head)) + return 0; + end = head; + sentry = head; + for (i = 0; i < remaining; ++i) { + if (sentry->next == head) + /* too few */ + return 0; + sentry = sentry->next; + } + + for (i = 0; 
sentry->next != head; ++i) { + end = end->next; + sentry = sentry->next; + } + + INIT_LIST_HEAD(tmp); + list_cut_position(tmp, head, end); + return i; +} + +static void give_up_pages(void *info) +{ + struct eufs_sb_info *sbi = info; + unsigned long flags, flags2; + LIST_HEAD(tmp); + struct mem_pool *ppool; + int i = 0; + int cpu; + + cpu = get_cpu(); + local_irq_save(flags2); + /* Need a way to get it back */ + ppool = per_cpu_ptr(sbi->ppool, cpu); + ppool->fetch_count = 10; + + i = cut_from_list_remaining(&ppool->page_list, ppool->fetch_count, + &tmp); + + if (i) { + spin_lock_irqsave(&sbi->page_lock, flags); + list_splice_tail(&tmp, &sbi->gpool->page_list); + sbi->gpool->npages += i; + spin_unlock_irqrestore(&sbi->page_lock, flags); + + ppool->npages -= i; + } + + i = cut_from_list_remaining(&ppool->large_list, 1, &tmp); + if (i) { + spin_lock_irqsave(&sbi->large_lock, flags); + list_splice_tail(&tmp, &sbi->gpool->large_list); + sbi->gpool->nlarges += i; + spin_unlock_irqrestore(&sbi->large_lock, flags); + + ppool->nlarges -= i; + } + + local_irq_restore(flags2); + put_cpu(); +} + +void revive_rest_pool(struct eufs_sb_info *sbi); + +static void gather_pages(struct eufs_sb_info *sbi) +{ + smp_call_func_t func = give_up_pages; + unsigned long flags; + + /* Gather from other CPUs */ + mutex_lock(&sbi->gather_mutex); + + smp_call_function(func, sbi, true); + + mutex_unlock(&sbi->gather_mutex); + + /* Gather from rest pool, if necessary */ + spin_lock_irqsave(&sbi->page_lock, flags); + if (!list_empty(&sbi->gpool->page_list)) { + spin_unlock_irqrestore(&sbi->page_lock, flags); + return; + } + spin_unlock_irqrestore(&sbi->page_lock, flags); + + revive_rest_pool(sbi); + /* I've tried the best */ +} + +static bool reload_lines_from_gpool(struct eufs_sb_info *sbi, + struct mem_pool *ppool) +{ + struct ptr_list_node *node; + struct list_head *head; + struct list_head *end; + unsigned long flags; + LIST_HEAD(tmp); + int i; + + spin_lock_irqsave(&sbi->line_lock, flags); + head = &sbi->gpool->line_list; + if (list_empty(head)) { + spin_unlock_irqrestore(&sbi->line_lock, flags); + return false; + } + end = head; + + /* head is not a legal node */ + for (i = 0; i < ppool->fetch_count && end->next != head; ++i) { + end = end->next; + node = list_entry(end, struct ptr_list_node, node); + /* move out of global pool */ + --sbi->line_indicators[(node->ptr - sbi->data_start) / + PAGE_SIZE]; + } + list_cut_position(&tmp, head, end); + list_splice_tail(&tmp, &ppool->line_list); + + sbi->gpool->nlines -= i; + ppool->nlines += i; + spin_unlock_irqrestore(&sbi->line_lock, flags); + + return true; +} + +static bool reload_large_from_gpool(struct eufs_sb_info *sbi, + struct mem_pool *ppool, bool nonblocking) +{ + struct list_head *head; + struct list_head *end; + LIST_HEAD(tmp); + int i; + unsigned long flags; + + spin_lock_irqsave(&sbi->large_lock, flags); + + if (nonblocking) { + if (sbi->gpool->nlarges == 0) { + spin_unlock_irqrestore(&sbi->large_lock, flags); + return false; + } + } else { + /* blocking is okay */ + if (sbi->gpool->nlarges <= NR_RESERVED_PAGES) { + spin_unlock_irqrestore(&sbi->large_lock, flags); + return false; + } + } + head = &sbi->gpool->large_list; + end = head; + + for (i = 0; i < ppool->fetch_count && end->next != head; ++i) + end = end->next; + list_cut_position(&tmp, head, end); + list_splice_tail(&tmp, &ppool->large_list); + + sbi->gpool->nlarges -= i; + ppool->nlarges += i; + + spin_unlock_irqrestore(&sbi->large_lock, flags); + + return true; +} + +static bool 
reload_page_from_gpool(struct eufs_sb_info *sbi, + struct mem_pool *ppool, bool nonblocking) +{ + struct list_head *head; + struct list_head *end; + LIST_HEAD(tmp); + int i; + unsigned long flags; + + spin_lock_irqsave(&sbi->page_lock, flags); + + if (nonblocking) { + if (sbi->gpool->npages == 0) { + spin_unlock_irqrestore(&sbi->page_lock, flags); + return false; + } + } else { + /* blocking is okay */ + if (sbi->gpool->npages <= NR_RESERVED_PAGES) { + spin_unlock_irqrestore(&sbi->page_lock, flags); + return false; + } + } + head = &sbi->gpool->page_list; + end = head; + + for (i = 0; i < ppool->fetch_count && end->next != head; ++i) + end = end->next; + list_cut_position(&tmp, head, end); + list_splice_tail(&tmp, &ppool->page_list); + + sbi->gpool->npages -= i; + ppool->npages += i; + + spin_unlock_irqrestore(&sbi->page_lock, flags); + + return true; +} + +void revive_rest_pool(struct eufs_sb_info *sbi) +{ + unsigned long flags; + + spin_lock_irqsave(&sbi->page_lock, flags); + spin_lock(&sbi->large_lock); + spin_lock(&sbi->line_lock); + spin_lock(&sbi->rest_lock); + + list_splice_init(&sbi->rest_pool->large_list, &sbi->gpool->large_list); + list_splice_init(&sbi->rest_pool->page_list, &sbi->gpool->page_list); + list_splice_init(&sbi->rest_pool->line_list, &sbi->gpool->line_list); + sbi->gpool->nlarges += sbi->rest_pool->nlarges; + sbi->gpool->npages += sbi->rest_pool->npages; + sbi->gpool->nlines += sbi->rest_pool->nlines; + sbi->rest_pool->nlarges = 0; + sbi->rest_pool->npages = 0; + sbi->rest_pool->nlines = 0; + + spin_unlock(&sbi->rest_lock); + spin_unlock(&sbi->line_lock); + spin_unlock(&sbi->large_lock); + spin_unlock_irqrestore(&sbi->page_lock, flags); +} + +static __always_inline int cut_from_list(struct list_head *head, + struct list_head *list, int count) +{ + struct list_head *end = head; + int i; + + for (i = 0; i < count && end->next != head; ++i) + end = end->next; + list_cut_position(list, head, end); + return i; +} + +static void preallocate_pages_from_larges_and_pages(struct eufs_sb_info *sbi, + struct alloc_batch *ab, + size_t count, + struct mem_pool *pool) +{ + struct ptr_list_node *list_node; + long nlarges_needed; + size_t r = 0; + int i; + + WARN(!irqs_disabled(), "Interrupt is not disabled!"); + + WARN(count > pool->nlarges * 512 + pool->npages, + "Invarients violated!"); + + if (count <= pool->npages) { + r = cut_from_list(&pool->page_list, &ab->list, count); + pool->npages -= r; + WARN_ON(r != count); + return; + } + + nlarges_needed = DIV_ROUND_UP(count - pool->npages, 512); + if ((nlarges_needed * 512) < count) { + r = cut_from_list(&pool->page_list, &ab->list, + count - (nlarges_needed * 512)); + WARN_ON(r != count - (nlarges_needed * 512)); + pool->npages -= r; + } + while (nlarges_needed--) { + list_node = list_first_entry(&pool->large_list, + struct ptr_list_node, node); + list_del(&list_node->node); + pool->nlarges--; + list_node->multiple = false; + /* split the large page */ + for (i = 0; i < 512; ++i) { + if (r < count) { + list_add(&list_node->node, &ab->list); + } else { + /* + * When all requested pages come from splitting of + * large pages, the remaining pages needs to add + * the list of normal page + */ + list_add(&list_node->node, &pool->page_list); + pool->npages++; + } + + r++; + list_node++; + } + } +} + +static int preallocate_page_from_pool(struct eufs_sb_info *sbi, + struct alloc_batch *ab, size_t count, + struct mem_pool *ppool) +{ + BUG_ON(!list_empty(&ab->list)); + BUG_ON(count > ppool->nlarges * 512 + ppool->npages); + + /* get locally 
with large pages and pages */ + preallocate_pages_from_larges_and_pages(sbi, ab, count, ppool); + + return 0; +} + +static int preallocate_page_from_gpool(struct eufs_sb_info *sbi, + struct alloc_batch *ab, size_t count) +{ + unsigned long flags; + u64 nlarges_avail = 0; + u64 npages_avail = 0; + + BUG_ON(!list_empty(&ab->list)); + + spin_lock_irqsave(&sbi->page_lock, flags); + spin_lock(&sbi->large_lock); + /* enough pages are available? */ + /* + * We have NR_RESERVED_PAGES pages reserved for allocation in page fault + * handlers, so do not use reserved pages if we can gather from other + * CPUs. + * NOTICE: We'd better not to use minus here since sbi->gpool->npages is + * unsigned. + */ + if (sbi->gpool->nlarges > NR_RESERVED_PAGES) + nlarges_avail = sbi->gpool->nlarges - NR_RESERVED_PAGES; + if (sbi->gpool->npages > NR_RESERVED_PAGES) + npages_avail = sbi->gpool->npages - NR_RESERVED_PAGES; + + if (count > nlarges_avail * 512 + npages_avail) { + spin_unlock(&sbi->large_lock); + spin_unlock_irqrestore(&sbi->page_lock, flags); + /* unlock and gather page */ + gather_pages(sbi); + /* relock after the gathering */ + spin_lock_irqsave(&sbi->page_lock, flags); + spin_lock(&sbi->large_lock); + /* enough pages this time? */ + if (count > sbi->gpool->nlarges * 512 + sbi->gpool->npages) { + spin_unlock(&sbi->large_lock); + spin_unlock_irqrestore(&sbi->page_lock, flags); + return -ENOSPC; + } + } + + /* get locally with large pages and pages */ + preallocate_pages_from_larges_and_pages(sbi, ab, count, sbi->gpool); + + spin_unlock(&sbi->large_lock); + spin_unlock_irqrestore(&sbi->page_lock, flags); + + return 0; +} + +void *nvmalloc_pre_get_from_list(struct super_block *sb, struct list_head *list, + u8 tag) +{ + struct ptr_list_node *list_node = + list_first_entry(list, struct ptr_list_node, node); + void __pmem *page = list_node->ptr; + + list_del(&list_node->node); + list_node->tag = tag; + /* list_node->solid is unchanged. */ + _SET_BUSY(list_node, "set_busy addr=%px", page); + + eufs_dbg("nvallocate pre-from-list: %px bitmap=%d busy=%d\n", page, + EUFS_SB(sb)->page_map[(page - EUFS_SB(sb)->data_start) / + PAGE_SIZE], + EUFS_SB(sb) + ->cached_nodes[(page - EUFS_SB(sb)->data_start) / + PAGE_SIZE] + .busy); + return page; +} + +int nvmalloc_pre(struct super_block *sb, struct alloc_batch *ab, size_t count, + size_t size) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + struct mem_pool *ppool; + unsigned long flags; + int cpu; + int r; + /* size other than PAGE_SIZE not supported currently */ + if (size != PAGE_SIZE) + return -EOPNOTSUPP; + + cpu = get_cpu(); + local_irq_save(flags); + + ppool = per_cpu_ptr(sbi->ppool, cpu); + if (count <= ppool->nlarges * 512 + ppool->npages) { + /* get locally */ + r = preallocate_page_from_pool(sbi, ab, count, ppool); + local_irq_restore(flags); + put_cpu(); + return r; + } + + /* get from global pool */ + local_irq_restore(flags); + put_cpu(); + r = preallocate_page_from_gpool(sbi, ab, count); + return r; +} + +/* + * Large: 2M + * Page: 4K + * Line4: 256B + * Line: 64B + */ + +#define LARGE_PAGE_SIZE (2 << 20) + +/* + * get from ppool list, then from the global list if present, + * if failed, break larger units. 
+ */ +static void *try_get_large_page(struct eufs_sb_info *sbi, + struct mem_pool *ppool, u8 tag, + bool nonblocking) +{ + struct ptr_list_node *list_node; + void *ret; + unsigned long flags; + +retry: + if (list_empty(&ppool->large_list) && + !reload_large_from_gpool(sbi, ppool, nonblocking)) + return NULL; + + local_irq_save(flags); + if (list_empty(&ppool->large_list)) { + local_irq_restore(flags); + goto retry; + } + + list_node = list_first_entry(&ppool->large_list, struct ptr_list_node, + node); + ret = list_node->ptr; + list_del(&list_node->node); + ppool->nlarges--; + list_node->tag = tag; + + local_irq_restore(flags); + + /* list_node->solid is unchanged. */ + _SET_BUSY(list_node, "set_busy addr=%px", ret); + + BUG_ON(((u64)ret % LARGE_PAGE_SIZE)); + + return ret; +} + +static void *eufs_try_get_page(struct eufs_sb_info *sbi, struct mem_pool *ppool, + u8 tag, bool use_reserved) +{ + struct ptr_list_node *list_node; + struct ptr_list_node *node; + void *ret; + unsigned long flags; + void *large; + int i; + u64 page_no; + +retry: + if (list_empty(&ppool->page_list)) { + /* slow path */ + if (!reload_page_from_gpool(sbi, ppool, use_reserved)) { + /* TODO: merge pages back to large pages? */ + large = try_get_large_page(sbi, ppool, 0, use_reserved); + if (!large) + return NULL; + page_no = (large - sbi->data_start) / PAGE_SIZE; + for (i = 1; i < 512; ++i) { + node = &sbi->cached_nodes[page_no + i]; + node->multiple = false; + return_page(sbi, ppool, node, false); + } + sbi->cached_nodes[page_no].multiple = false; + sbi->cached_nodes[page_no].tag = tag; + return large; + } + } + local_irq_save(flags); + if (list_empty(&ppool->page_list)) { + local_irq_restore(flags); + goto retry; + } + list_node = + list_first_entry(&ppool->page_list, struct ptr_list_node, node); + + ret = list_node->ptr; + list_del(&list_node->node); + ppool->npages--; + list_node->tag = tag; + + local_irq_restore(flags); + + /* list_node->solid is unchanged. */ + _SET_BUSY(list_node, "set_busy addr=%px", ret); + + BUG_ON(((u64)ret % PAGE_SIZE)); + + return ret; +} + +/* NOTICE: cpu changes in this function */ +static struct ptr_list_node *split_page_to_lines(struct eufs_sb_info *sbi, + struct mem_pool *ppool, + void *page, bool use_line4) +{ + struct ptr_list_node *node, *ret = NULL; + u64 page_no; + int cpu; + int i; + /* Release the cpu since may need to allocate a page. */ + put_cpu(); + + /* Split the page */ + page_no = (page - sbi->data_start) / PAGE_SIZE; + sbi->line_indicators[page_no] = 0; + + if (sbi->line_node_ptrs[page_no]) { + memclr(sbi->line_node_ptrs[page_no], PAGE_SIZE); + } else { + sbi->line_node_ptrs[page_no] = eufs_zalloc_page(); + BUG_ON(!sbi->line_node_ptrs[page_no]); + } + memclr(page, CACHELINE_SIZE); + + /* cache line 0: bitmap */ + /* cache line 1~3: insert to line_list */ + /* cache line >4: insert to line4_list */ + + /* + * Reget the cpu. The cpu might be different from the + * one we previously got, but it doesn't matter. 
+ */ + cpu = get_cpu(); + ppool = per_cpu_ptr(sbi->ppool, cpu); + for (i = 1; i < 64; ++i) { + node = &sbi->line_node_ptrs[page_no][i]; + node->ptr = page + i * CACHELINE_SIZE; + node->busy = false; + node->solid = false; + node->multiple = false; + } + for (i = 1; i < 4; ++i) { + node = &sbi->line_node_ptrs[page_no][i]; + if (!use_line4 && i == 1) { + ret = node; + continue; + } + return_cl(sbi, ppool, node, false); + } + for (i = 4; i < 64; i += 4) { + node = &sbi->line_node_ptrs[page_no][i]; + node->multiple = true; + if (use_line4 && i == 4) { + ret = node; + continue; + } + return_line4(sbi, ppool, node, false); + } + return ret; +} + +static void *try_get_line4(struct eufs_sb_info *sbi, struct mem_pool *ppool, + u8 tag, bool use_reserved) +{ + struct ptr_list_node *list_node; + unsigned long flags; + void *ret; + +retry: + /* cache line x 4 */ + if (list_empty(&ppool->line4_list)) { + /* Cannot fetch cache lines from gpool, get from page */ + ret = eufs_try_get_page(sbi, ppool, 0, use_reserved); + if (ret == NULL) + return NULL; + + list_node = split_page_to_lines(sbi, ppool, ret, true); + ret = list_node->ptr; + list_node->tag = tag; + goto out; + } + + local_irq_save(flags); + if (list_empty(&ppool->line4_list)) { + local_irq_restore(flags); + goto retry; + } + + list_node = list_first_entry(&ppool->line4_list, struct ptr_list_node, + node); + ret = list_node->ptr; + list_del(&list_node->node); + + ppool->nline4s--; + list_node->tag = tag; + + local_irq_restore(flags); +out: + + _SET_BUSY(list_node, "error cacheline addr=%px", ret); + + return ret; +} + +static void *try_get_line(struct eufs_sb_info *sbi, struct mem_pool *ppool, + u8 tag, bool use_reserved) +{ + struct ptr_list_node *list_node; + struct ptr_list_node *node; + unsigned long flags; + void *ret; + int k; + +retry: + /* cache line x 1 */ + if (list_empty(&ppool->line_list)) { + /* Fetch cache lines from gpool */ + if (!reload_lines_from_gpool(sbi, ppool) /* slow path */) { + if (list_empty(&ppool->line4_list)) { + ret = eufs_try_get_page(sbi, ppool, 0, + use_reserved); + if (ret == NULL) + return NULL; + + list_node = split_page_to_lines(sbi, ppool, ret, + false); + ret = list_node->ptr; + list_node->tag = tag; + goto out; + } else { + local_irq_save(flags); + if (list_empty(&ppool->line4_list)) { + local_irq_restore(flags); + goto retry; + } + list_node = + list_first_entry(&ppool->line4_list, + struct ptr_list_node, + node); + ret = list_node->ptr; + list_del(&list_node->node); + ppool->nline4s--; + list_node->tag = tag; + + list_node->multiple = false; + + for (k = 1; k < 4; ++k) { + node = list_node + k; + node->multiple = false; + list_add(&node->node, + &ppool->line_list); + ppool->nlines++; + } + local_irq_restore(flags); + goto out; + } + } + } + + local_irq_save(flags); + if (list_empty(&ppool->line_list)) { + local_irq_restore(flags); + goto retry; + } + + list_node = + list_first_entry(&ppool->line_list, struct ptr_list_node, node); + ret = list_node->ptr; + list_del(&list_node->node); + + ppool->nlines--; + list_node->tag = tag; + + local_irq_restore(flags); +out: + + _SET_BUSY(list_node, "error cacheline addr=%px", ret); + + return ret; +} + +/* + * If nonblocking is set, we will skip the gather phase and allocate from the + * reserved pages (in gpool) + */ +void *nvmalloc(struct super_block *sb, size_t size, u8 tag, bool nonblocking) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + struct mem_pool *ppool; + void __pmem *ret; + int cpu; + u64 npages; + u64 nlines; + bool once_gathered = false; + void 
*(*try_get_)(struct eufs_sb_info *sbi, struct mem_pool *mp, u8 tag, + bool use_reserved); + + if (size == PAGE_SIZE << 9) { + try_get_ = try_get_large_page; + } else if (size == PAGE_SIZE) { + try_get_ = eufs_try_get_page; + } else if (size == CACHELINE_SIZE << 2) { + try_get_ = try_get_line4; + } else if (size == CACHELINE_SIZE) { + try_get_ = try_get_line; + } else { + WARN(1, "EulerFS: INVALID allocation size!"); + return NULL; + } + +gathered_retry: + cpu = get_cpu(); + ppool = per_cpu_ptr(sbi->ppool, cpu); + /* + * If we have gathered, we must try our best to allocate, so + * even the reserved pages can be used + */ + ret = try_get_(sbi, ppool, tag, nonblocking || once_gathered); + + if (ret == NULL) { + if (once_gathered || nonblocking) + /* Really full */ + goto full_out; + /* Maybe full. Try gather from other CPUs. */ + put_cpu(); + gather_pages(sbi); + once_gathered = true; + goto gathered_retry; + } + put_cpu(); + + eufs_dbg("nvallocate: %px bitmap=%d busy=%d @cpu=%d\n", ret, + sbi->page_map[(ret - sbi->data_start) / PAGE_SIZE], + sbi->cached_nodes[(ret - sbi->data_start) / PAGE_SIZE].busy, + cpu); + + WARN_ON(ret == NULL); + return ret; +full_out: + put_cpu(); + nv_stat(sbi, &npages, &nlines); + pr_warn_ratelimited("EulerFS is FULL! @%d (%lld pages, %lld lines)\n", + smp_processor_id(), npages, nlines); + return NULL; +} + +static void _unset_bitmap(struct eufs_sb_info *sbi, u64 addr, bool flush) +{ + u64 page_no = (addr - (u64)sbi->data_start) / PAGE_SIZE; + u64 rem = addr % PAGE_SIZE; + line_info_t __pmem *line_map; + struct ptr_list_node *node; + int line_no; + + node = sbi->cached_nodes + (page_no); + if (rem == 0) { + /* + * the nvmalloc->nvfree case should be handled when nolde->solid + * is false if the allocation is implemented. Same as below. 
+ */ + if (node->solid) { + BUG_ON(sbi->page_map[page_no] == EUFS_PAGE_FREE); + sbi->page_map[page_no] = EUFS_PAGE_FREE; + if (flush) + eufs_flush_cacheline(&sbi->page_map[page_no]); + } + + BUG_ON(sbi->page_map[page_no] != EUFS_PAGE_FREE); + node->solid = false; + } else { + /* line */ + BUG_ON(rem % CACHELINE_SIZE != 0); + + BUG_ON(sbi->page_map[page_no] != EUFS_PAGE_FREE && + sbi->page_map[page_no] != EUFS_PAGE_LINE_USED); + + if (!node->solid) { + /* the allocation is not written yet */ + /* HACK: idempotent */ + if (sbi->page_map[page_no] != EUFS_PAGE_LINE_USED) { + sbi->page_map[page_no] = EUFS_PAGE_LINE_USED; + eufs_flush_cacheline(&sbi->page_map[page_no]); + } + node->solid = true; + } + + node = &sbi->line_node_ptrs[page_no][rem / CACHELINE_SIZE]; + line_map = (void *)(addr - rem); + line_no = rem / CACHELINE_SIZE; + + if (node->solid) { + BUG_ON(line_map[line_no] == EUFS_LINE_FREE); + line_map[line_no] = EUFS_LINE_FREE; + eufs_dbg("unset %px[%d] = 0\n", line_map, line_no); + + if (flush) + eufs_flush_cacheline(&line_map[line_no]); + } + + node->solid = false; + BUG_ON(line_map[line_no] != EUFS_LINE_FREE); + } +} + +void nvfree(struct super_block *sb, void *ptr, bool rest) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + struct mem_pool *ppool; + struct ptr_list_node *node; + s64 offset; + int cpu; + u64 end = sbi->npages * PAGE_SIZE; + + if (ptr == NULL_ADDR_PTR) + return; + + offset = ptr - sbi->data_start; + BUG_ON(offset < 0); + BUG_ON(offset >= end); + + eufs_dbg("%s: %px bitmap=%d busy=%d\n", __func__, ptr, + sbi->page_map[(ptr - sbi->data_start) / PAGE_SIZE], + sbi->cached_nodes[(ptr - sbi->data_start) / PAGE_SIZE].busy); + + _unset_bitmap(sbi, (u64)ptr, true); + + cpu = get_cpu(); + ppool = per_cpu_ptr(sbi->ppool, cpu); + if ((u64)ptr % PAGE_SIZE == 0) { + /* page */ + + /* get node */ + node = sbi->cached_nodes + offset / PAGE_SIZE; + node->ptr = ptr; + _SET_NON_BUSY(node, "fault addr %px", ptr); + /* add to page-to-free list */ + if (node->multiple) + WARN_ON_ONCE(1); + else + return_page(sbi, ppool, node, rest); + } else if ((u64)ptr % CACHELINE_SIZE == 0) { + /* cache line */ + + /* get node */ + node = &sbi->line_node_ptrs[offset / PAGE_SIZE] + [offset % PAGE_SIZE / CACHELINE_SIZE]; + _SET_NON_BUSY(node, "fault addr %px", ptr); + /* add to local cl pool */ + if (node->multiple) + return_line4(sbi, ppool, node, rest); + else + return_cl(sbi, ppool, node, rest); + } else { + /* error */ + eufs_warn("!err allocation type!\n"); + } + put_cpu(); + eufs_dbg("%s done: %px bitmap=%d busy=%d\n", __func__, ptr, + sbi->page_map[(ptr - sbi->data_start) / PAGE_SIZE], + sbi->cached_nodes[(ptr - sbi->data_start) / PAGE_SIZE].busy); +} diff --git a/fs/eulerfs/nvalloc.h b/fs/eulerfs/nvalloc.h new file mode 100644 index 000000000000..a39b81862bfb --- /dev/null +++ b/fs/eulerfs/nvalloc.h @@ -0,0 +1,214 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef EUFS_NVALLOC_H +#define EUFS_NVALLOC_H + +#include "euler.h" +#include "euler_common.h" +#include <linux/sort.h> +#include <linux/vmalloc.h> + +/* + * The design of the allocator is hybrid, and the in-dram allocators are + * per-cpu to accelerate the zalloc/zfree process. + * + * This allocator supports only 4K and 64B (one cache line) so no chunks + * are involved in the design. + */ +/* + * In NVM +---> 0/1: 4K/64B + * +-----------------+ | + * | page-map *+-->+----+---1B-in-size+- + * | (page-info) | | type | birth_gen | + * +-----------------+ +------------------+ + * | pages | | type2| check_gen | + * | (actual data)* *| +------------------+ + * +--------------+-++ + * | | + * | +---> +-------------------------+ + * v | gen x 63 + lock/cpu x 1 | + * +-----------+ +-------------------------+ + * | | | cache line data 1 | + * | 4K page | +-------------------------+ + * | data | | cache line data 2 | + * | | +-------------------------+ + * +-----------+ | ... | + * +-------------------------+ + * | cache line data 63 | + * +-------------------------+ + */ + +/* + * In DRAM, percpu + * +----------------+ + * | free-page-list +----->... + * +----------------+ + * | free-line-list +----->... + * +----------------+ + * + * global + * +----------------+ + * | free-page-lists+----> free-page-list --> free-page-list + * +----------------+ + * | free-line-lists +----> free-line-list --> free-line-list + * +----------------+ + */ + +extern int wear_control; +extern int wear_alloc_threshold; + +struct ptr_list_node { + struct list_head node; /* points to next list node */ + void __pmem *ptr; + bool busy; /* whether it is allocated in the volatile allocator */ + bool solid; /* whether it is allocated in the bitmap */ + bool multiple; /* whther it is a linex4/large-page */ + u8 tag; + int counter; /* How many times has it been allocated? */ +}; + +struct mem_pool { + struct list_head page_list; /* points to ptr_lists_node */ + struct list_head line_list; /* points to ptr_lists_node */ + struct list_head line4_list; + struct list_head large_list; + u64 npages; + u64 nlines; + u64 nline4s; + u64 nlarges; + int fetch_count; +}; + +#define _SET_NON_BUSY(node, fmt, args...) \ + do { \ + if (node->busy == false) { \ + eufs_info(fmt, ##args); \ + BUG(); \ + } \ + node->busy = false; \ + } while (0) + +#define _SET_BUSY(node, fmt, args...) 
\ + do { \ + if (node->busy == true) { \ + eufs_info(fmt, ##args); \ + BUG(); \ + } \ + node->busy = true; \ + } while (0) + +#define EUFS_PAGE_FREE (0) +#define EUFS_PAGE_USED (1) +#define EUFS_PAGE_LINE_USED (2) +#define EUFS_PAGE_LARGE_USED (3) +#define EUFS_PAGE_RESERVED (5) + +#define EUFS_LINE_FREE (0) +#define EUFS_LINE_USED (1) +#define EUFS_LINE4_USED (2) + +#define EUFS_PAGE_FILE_DATA (8) +#define EUFS_PAGE_FILE_INDEX (9) +#define EUFS_PAGE_HTABLE (10) +#define EUFS_PAGE_SYMLINK (11) +#define EUFS_PAGE_INODE_EXT (12) + +#define EUFS_LINE4_INODE (4) +#define EUFS_LINE_DENTRY (5) +#define EUFS_LINE_NAME_EXT (6) + +void *nvmalloc_pre_get_from_list(struct super_block *sb, struct list_head *list, + u8 tag); +int nvmalloc_pre(struct super_block *sb, struct alloc_batch *ab, size_t count, + size_t size); +void *nvmalloc(struct super_block *sb, size_t size, u8 tag, bool nonblocking); +void nvfree(struct super_block *sb, void *ptr, bool rest); +void nv_init(struct super_block *sb, bool init); +void nv_fini(struct super_block *sb); +void eufs_get_layout(struct super_block *sb, bool init); + +#define FETCH_COUNT 64 +#define EUFS_PRE_PAGES_PERCPU (4096) + +#define LOCAL_PAGE_MAX (4096 * 8) +#define LOCAL_LINE_MAX (4096) + +#define NR_RESERVED_PAGES (64) + +static __always_inline void print_line_map(line_info_t *line_map, u8 line_num) +{ + int i; + + eufs_info("line_map[line_num]: %px[%u]=%u\n", line_map, line_num, + line_map[line_num]); + eufs_info("line_map=%px ===>\n", line_map); + for (i = 0; i < 8; ++i) { + int i8 = i * 8; + + eufs_info("%d: %u %u %u %u %u %u %u %u\n", i, line_map[i8 + 0], + line_map[i8 + 1], line_map[i8 + 2], line_map[i8 + 3], + line_map[i8 + 4], line_map[i8 + 5], line_map[i8 + 6], + line_map[i8 + 7]); + } +} + +static __always_inline void nv_stat(struct eufs_sb_info *sbi, u64 *page, + u64 *line) +{ + struct mem_pool *ppool; + u64 nlarges = sbi->gpool->nlarges; + u64 npages = sbi->gpool->npages; + u64 nline4s = 0; + u64 nlines = sbi->gpool->nlines; + int cpu; + + for_each_online_cpu(cpu) { + ppool = per_cpu_ptr(sbi->ppool, cpu); + + nlarges += ppool->nlarges; + npages += ppool->npages; + nline4s += ppool->nline4s; + nlines += ppool->nlines; + } + *page = npages + (nlarges << 9); + *line = nlines + (nline4s << 2); +} + +static __always_inline void print_stats(struct eufs_sb_info *sbi) +{ + struct mem_pool *ppool; + int cpu; + u64 nlarges = sbi->gpool->nlarges; + u64 npages = sbi->gpool->npages; + u64 nline4s = 0; + u64 nlines = sbi->gpool->nlines; + + eufs_info("Stat: (g,%lld,%lld), ", sbi->gpool->npages, + sbi->gpool->nlines); + for_each_online_cpu(cpu) { + ppool = per_cpu_ptr(sbi->ppool, cpu); + + nlarges += ppool->nlarges; + npages += ppool->npages; + nline4s += ppool->nline4s; + nlines += ppool->nlines; + + eufs_info("(@%d,%lld,%lld,%lld,%lld) ", cpu, ppool->nlarges, + ppool->npages, ppool->nline4s, ppool->nlines); + } + eufs_info("= (summary: larges=%lld pages=%lld line4s=%lld lines=%lld)\n", + nlarges, npages, nline4s, nlines); +} + +#endif /* EUFS_NVALLOC_H */
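For reference, the allocator above recognizes exactly four request sizes: a 2 MiB large page, a 4 KiB page, a 256 B line4 and a 64 B cache line; nvmalloc() dispatches each request to the matching free list. The stand-alone user-space sketch below mirrors only that dispatch step; pick_size_class() and its driver are illustrative names, not EulerFS code.

#include <stdio.h>
#include <stddef.h>

#define CACHELINE_SIZE  64UL
#define PAGE_SIZE       4096UL
#define LARGE_PAGE_SIZE (2UL << 20)   /* 2 MiB, i.e. PAGE_SIZE << 9 */

/* Illustrative mirror of the size-class check at the top of nvmalloc(). */
static const char *pick_size_class(size_t size)
{
	if (size == LARGE_PAGE_SIZE)
		return "large_list (2 MiB)";
	if (size == PAGE_SIZE)
		return "page_list (4 KiB)";
	if (size == CACHELINE_SIZE << 2)
		return "line4_list (256 B)";
	if (size == CACHELINE_SIZE)
		return "line_list (64 B)";
	return "invalid allocation size";
}

int main(void)
{
	size_t sizes[] = { 64, 256, 4096, 2UL << 20, 123 };
	size_t i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%zu -> %s\n", sizes[i], pick_size_class(sizes[i]));
	return 0;
}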
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR
CVE: NA
--------------------------------------
Implement interfaces to flush a cache line, a page and a buffer, using clwb, clflushopt or clflush depending on CPU support.
Signed-off-by: Mingkai Dong dongmingkai1@huawei.com
Signed-off-by: Hou Tao houtao1@huawei.com
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Hou Tao houtao1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 fs/eulerfs/flush.h | 171 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 171 insertions(+)
 create mode 100644 fs/eulerfs/flush.h
diff --git a/fs/eulerfs/flush.h b/fs/eulerfs/flush.h new file mode 100644 index 000000000000..9baa2b533196 --- /dev/null +++ b/fs/eulerfs/flush.h @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_FLUSH_H +#define EUFS_FLUSH_H + +#ifdef CONFIG_X86_64 +static __always_inline bool arch_has_clwb(void) +{ + return static_cpu_has(X86_FEATURE_CLWB); +} + +static __always_inline bool arch_has_clflushopt(void) +{ + return static_cpu_has(X86_FEATURE_CLFLUSHOPT); +} + +static __always_inline bool arch_has_clflush(void) +{ + return static_cpu_has(X86_FEATURE_CLFLUSH); +} + +static __always_inline bool arch_has_rtm(void) +{ + return static_cpu_has(X86_FEATURE_RTM); +} + +static __always_inline void __sfence(void) +{ + asm volatile("sfence\n" : : : "memory"); +} + +static inline void _mm_clflush(const void *addr) +{ + asm volatile("clflush %0" : "+m"(*(volatile char *)(addr))); +} + +static inline void _mm_clflushopt(const void *addr) +{ + asm volatile(".byte 0x66; clflush %0" : "+m"(*(volatile char *)(addr))); +} + +static inline void _mm_clwb(const void *addr) +{ + asm volatile(".byte 0x66; xsaveopt %0" : "+m"(*(volatile char *)(addr))); +} + +#else +static __always_inline bool arch_has_clwb(void) +{ + return false; +} + +static __always_inline bool arch_has_clflushopt(void) +{ + return false; +} + +static __always_inline bool arch_has_clflush(void) +{ + return false; +} + +static __always_inline bool arch_has_rtm(void) +{ + return false; +} + +static __always_inline void __sfence(void) +{ + /* arm64 doesn't support sfence */ + smp_mb(); +} + +#define _mm_clflush(addr) do {} while (0) +#define _mm_clflushopt(addr) do {} while (0) +#define _mm_clwb(addr) do {} while (0) +#endif + +extern int support_rtm; +extern int support_clwb; +extern int support_clflushopt; +extern int support_clflush; +extern int clflush_delay; +extern int force_nocache_write; +extern int max_dirty_inodes; +extern int max_dep_nodes; + +static __always_inline void eufs_sfence(void) +{ + __sfence(); +} + +static __always_inline void eufs_pbarrier(void) +{ + if (support_clwb || support_clflushopt) + eufs_sfence(); +} + +static __always_inline void eufs_flush_cacheline(const void *ptr) +{ + if (support_clwb) + _mm_clwb(ptr); + else if (support_clflushopt) + _mm_clflushopt(ptr); + else if (support_clflush) + _mm_clflush(ptr); +} + +static __always_inline void eufs_flush_page(const void *ptr) +{ + uint32_t i; + + if (support_clwb) { + for (i = 0; i < PAGE_SIZE; i += CACHELINE_SIZE) + _mm_clwb(ptr + i); + } else if (support_clflushopt) { + for (i = 0; i < PAGE_SIZE; i += CACHELINE_SIZE) + _mm_clflushopt(ptr + i); + } else if (support_clflush) { + for (i = 0; i < PAGE_SIZE; i += CACHELINE_SIZE) + _mm_clflush(ptr + i); + } +} + +static __always_inline void eufs_flush_buffer(const void *buf, uint32_t len, + bool fence) +{ + uint32_t i; + uint32_t aligned_len = + len + ((unsigned long)(buf) & (CACHELINE_SIZE - 1)); + + if (support_clwb) { + for (i = 0; i < aligned_len; i += 
CACHELINE_SIZE) + _mm_clwb(buf + i); + } else if (support_clflushopt) { + for (i = 0; i < aligned_len; i += CACHELINE_SIZE) + _mm_clflushopt(buf + i); + } else if (support_clflush) { + for (i = 0; i < aligned_len; i += CACHELINE_SIZE) { + /* flush the cache line that contains the address (buf + i) */ + _mm_clflush(buf + i); + } + } + + /* + * Do a fence only if asked. We often don't need to do a fence + * immediately after clflush because even if we get context switched + * between clflush and subsequent fence, the context switch operation + * provides implicit fence. + */ + if (fence) + eufs_sfence(); +} + +static __always_inline void eufs_flush_range(const void *ptr, uint32_t len) +{ + eufs_flush_buffer(ptr, len, false); +} + +#endif /* EUFS_FLUSH_H */
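For reference, eufs_flush_buffer() extends the loop length by the buffer's offset within its first cache line, so the clwb/clflushopt/clflush loop covers every line the buffer touches. Below is a minimal user-space sketch of that calculation only; no flush instructions are issued and the helper name is illustrative.

#include <stdio.h>
#include <stdint.h>

#define CACHELINE_SIZE 64UL

/* Count how many cache lines a flush of [buf, buf + len) must cover,
 * mirroring the aligned_len computation in eufs_flush_buffer(). */
static unsigned long lines_to_flush(const void *buf, unsigned long len)
{
	unsigned long aligned_len =
		len + ((unsigned long)buf & (CACHELINE_SIZE - 1));
	unsigned long i, n = 0;

	for (i = 0; i < aligned_len; i += CACHELINE_SIZE)
		n++;	/* a real implementation issues clwb/clflushopt here */
	return n;
}

int main(void)
{
	/* Pretend the buffer starts 60 bytes into a cache line. */
	const void *buf = (const void *)(uintptr_t)(0x1000 + 60);

	printf("%lu\n", lines_to_flush(buf, 100)); /* prints 3 */
	return 0;
}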
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR
CVE: NA
--------------------------------------
Signed-off-by: Hou Tao houtao1@huawei.com
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Hou Tao houtao1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 fs/eulerfs/lock.h | 49 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 fs/eulerfs/lock.h
diff --git a/fs/eulerfs/lock.h b/fs/eulerfs/lock.h new file mode 100644 index 000000000000..6c4fe7734ac6 --- /dev/null +++ b/fs/eulerfs/lock.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_LOCK_H +#define EUFS_LOCK_H + +#include "euler_def.h" + +static inline void eufs_inode_mark_lock_transferable(struct inode *inode) +{ + struct eufs_inode_info *vi = EUFS_I(inode); + + vi->i_lock_transferred = I_TRANS_AVAIL; +} + +static inline void eufs_inode_wait_lock_transfer_done(struct inode *inode) +{ + struct eufs_inode_info *vi = EUFS_I(inode); + + while (cmpxchg(&vi->i_lock_transferred, I_TRANS_AVAIL, I_TRANS_NONE) != + I_TRANS_AVAIL) + cond_resched(); +} + +/* return true on success, false on failure */ +static inline bool eufs_inode_mark_lock_transferring(struct inode *inode) +{ + struct eufs_inode_info *vi = EUFS_I(inode); + + return (cmpxchg(&vi->i_lock_transferred, I_TRANS_AVAIL, + I_TRANS_LOCKED) == I_TRANS_AVAIL); +} + +static inline void eufs_inode_lock_transfer_done(struct inode *inode) +{ + eufs_inode_mark_lock_transferable(inode); +} + +#endif /* EUFS_LOCK_H */
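For reference, the transfer handshake is a small state machine driven by cmpxchg() on i_lock_transferred: the owner marks the lock transferable, a helper atomically flips it to a locked state while it works, and the owner later reclaims it. Below is a stand-alone user-space sketch of the same handshake using C11 atomics; the state names mirror I_TRANS_NONE/I_TRANS_AVAIL/I_TRANS_LOCKED, but their numeric values here are assumptions.

#include <stdatomic.h>
#include <stdio.h>

/* Assumed values for the sketch only; the real ones live in euler_def.h. */
enum { TRANS_NONE, TRANS_AVAIL, TRANS_LOCKED };

static _Atomic int i_lock_transferred = TRANS_NONE;

/* Owner marks the inode lock as transferable. */
static void mark_transferable(void)
{
	atomic_store(&i_lock_transferred, TRANS_AVAIL);
}

/* Helper tries to take over the lock; returns 1 on success, like
 * eufs_inode_mark_lock_transferring(). */
static int try_take_over(void)
{
	int expected = TRANS_AVAIL;

	return atomic_compare_exchange_strong(&i_lock_transferred,
					      &expected, TRANS_LOCKED);
}

/* Owner waits until no transfer is in flight, like
 * eufs_inode_wait_lock_transfer_done(). */
static void wait_transfer_done(void)
{
	int expected;

	for (;;) {
		expected = TRANS_AVAIL;
		if (atomic_compare_exchange_strong(&i_lock_transferred,
						   &expected, TRANS_NONE))
			break;
		/* the kernel version calls cond_resched() here */
	}
}

int main(void)
{
	mark_transferable();
	printf("take over: %d\n", try_take_over());	/* 1: success */
	printf("take over: %d\n", try_take_over());	/* 0: already locked */
	atomic_store(&i_lock_transferred, TRANS_AVAIL);	/* transfer done */
	wait_transfer_done();
	printf("state back to NONE\n");
	return 0;
}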
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR
CVE: NA
--------------------------------------
Page wear counters are preallocated integers, one per page; each counter records the number of writes to its page. They give a coarse-grained view of how worn each page is.
Signed-off-by: Mingkai Dong dongmingkai1@huawei.com
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Hou Tao houtao1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 fs/eulerfs/wear.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/eulerfs/wear.h | 30 +++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+)
 create mode 100644 fs/eulerfs/wear.c
 create mode 100644 fs/eulerfs/wear.h
diff --git a/fs/eulerfs/wear.c b/fs/eulerfs/wear.c new file mode 100644 index 000000000000..3535efab9fa8 --- /dev/null +++ b/fs/eulerfs/wear.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "wear.h" +#include <linux/vmalloc.h> +#include "euler.h" + +void wear_init(struct super_block *sb) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + + if (!wear_control) + return; + sbi->page_wears = vmalloc(sizeof(struct page_wear) * sbi->npages); + memset(sbi->page_wears, 0, sizeof(struct page_wear) * sbi->npages); +} + +void wear_fini(struct super_block *sb) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + + if (!wear_control) + return; + if (sbi->page_wears) + vfree(sbi->page_wears); + sbi->page_wears = NULL; +} + +/* Return whether it's in a good state */ +bool wear_inc(struct super_block *sb, void *page) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + + if (!wear_control) + return true; + return sbi->page_wears[(page - sbi->data_start) / PAGE_SIZE].wear++ <= + wear_threshold; +} diff --git a/fs/eulerfs/wear.h b/fs/eulerfs/wear.h new file mode 100644 index 000000000000..d0114813f1ef --- /dev/null +++ b/fs/eulerfs/wear.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_WEAR_H +#define EUFS_WEAR_H + +#include <linux/fs.h> + +extern int wear_threshold; +extern int wear_control; + +struct page_wear { + int wear; +}; + +void wear_init(struct super_block *sb); +void wear_fini(struct super_block *sb); +bool wear_inc(struct super_block *sb, void *page); + +#endif /* EUFS_WEAR_H */
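For reference, wear_inc() bumps a per-page counter and reports whether the page is still under wear_threshold. Below is a user-space sketch of that bookkeeping; the threshold and page count are assumptions made for the sketch, the kernel's wear_threshold is a tunable defined elsewhere.

#include <stdio.h>
#include <stdlib.h>

#define NPAGES         8
#define WEAR_THRESHOLD 3	/* assumed value for illustration only */

struct page_wear {
	int wear;
};

static struct page_wear *page_wears;

/* Return whether the page is still in a good state, like wear_inc(). */
static int wear_inc(unsigned long page_no)
{
	return page_wears[page_no].wear++ <= WEAR_THRESHOLD;
}

int main(void)
{
	int i;

	page_wears = calloc(NPAGES, sizeof(*page_wears));
	if (!page_wears)
		return 1;

	for (i = 0; i < 6; i++)
		printf("write %d to page 0: %s\n", i,
		       wear_inc(0) ? "ok" : "worn");

	free(page_wears);
	return 0;
}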
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR
CVE: NA
--------------------------------------
Signed-off-by: Mingkai Dong dongmingkai1@huawei.com
Signed-off-by: Hou Tao houtao1@huawei.com
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Hou Tao houtao1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 fs/eulerfs/filename.h | 120 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 fs/eulerfs/filename.h
diff --git a/fs/eulerfs/filename.h b/fs/eulerfs/filename.h new file mode 100644 index 000000000000..1ded2fe77641 --- /dev/null +++ b/fs/eulerfs/filename.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_FILENAME_H +#define EUFS_FILENAME_H + +#include "alloc_interface.h" + +/* ========== filenames ========== */ +static __always_inline void eufs_free_name(struct super_block *sb, + struct nv_dict_entry *de) +{ + size_t len = HASHLEN_LEN(de->hv); + struct nv_name_ext *p; + struct nv_name_ext *next; + + if (likely(len <= FIRST_LEN)) + return; + p = s2p(sb, de->nextname); + len -= FIRST_LEN; + while (len > FOLLOW_LEN) { + next = s2p(sb, p->nextname); + nv_free(sb, p); + len -= FOLLOW_LEN; + p = next; + } + nv_free(sb, p); +} + +/* precondition: ext != NULL */ +/* Use with `eufs_free_page(page);` */ +static __always_inline void * +eufs_alloc_name_copy(struct super_block *sb, const char *name, size_t namelen, + const struct nv_name_ext *ext) +{ + char *page; + char *p; + size_t len; + + NV_ASSERT(namelen > FIRST_LEN); + NV_ASSERT(namelen <= EUFS_MAX_NAME_LEN); + + page = eufs_alloc_page(); + p = page; + memcpy(p, name, FIRST_LEN); + len = namelen - FIRST_LEN; + p += FIRST_LEN; + name = ext->name; + while (len > FOLLOW_LEN) { + memcpy(p, name, FOLLOW_LEN); + ext = s2p(sb, ext->nextname); + name = ext->name; + p += FOLLOW_LEN; + len -= FOLLOW_LEN; + } + memcpy(p, name, len); + *(char *)(p + len) = 0; + return page; +} +/* TODO: Handle allocation failure */ +static __always_inline int copy_filename(struct super_block *sb, + struct nv_dict_entry *de, hashlen_t hv, + const char *name) +{ + void *ext_pages[6]; + int n_ext_pages; + struct nv_name_ext *p; + struct nv_name_ext *new_p; + size_t len = HASHLEN_LEN(hv); + + BUILD_BUG_ON(FIRST_LEN + FOLLOW_LEN * 6 < EUFS_MAX_NAME_LEN); + BUG_ON(len > EUFS_MAX_NAME_LEN); + + de->hv = hv; + if (likely(len <= FIRST_LEN)) { + memcpy(de->name, name, len); + de->nextname = cpu_to_le64(EUFS_POISON_POINTER); + return 0; + } + n_ext_pages = 0; + memcpy(de->name, name, FIRST_LEN); + p = eufs_malloc_name_ext(sb); + de->nextname = p2s(sb, p); + if (!p) + goto NO_SPC; + ext_pages[n_ext_pages++] = p; + name += FIRST_LEN; + len -= FIRST_LEN; + + while (len > FOLLOW_LEN) { + memcpy(p->name, name, FOLLOW_LEN); + name += FOLLOW_LEN; + len -= FOLLOW_LEN; + new_p = eufs_malloc_name_ext(sb); + p->nextname = p2s(sb, new_p); + p = new_p; + if (!p) + goto NO_SPC; + ext_pages[n_ext_pages++] = p; + } + memcpy(p->name, name, len); + p->nextname = cpu_to_le64(EUFS_POISON_POINTER); + return 0; +NO_SPC: + while (n_ext_pages) + nv_free(sb, ext_pages[--n_ext_pages]); + return -ENOSPC; +} + +#endif /* EUFS_FILENAME_H */
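For reference, copy_filename() stores the first FIRST_LEN bytes of a name inline in the dentry and chains the remainder through nv_name_ext blocks of FOLLOW_LEN bytes each. The user-space sketch below only counts how many extension blocks a name needs, mirroring that loop; the concrete FIRST_LEN/FOLLOW_LEN values are assumptions here, the real constants are defined in EulerFS's layout headers rather than in this patch.

#include <stdio.h>
#include <string.h>

/* Assumed sizes for illustration only. */
#define FIRST_LEN  40UL
#define FOLLOW_LEN 56UL

/* How many nv_name_ext blocks a name of the given length needs, mirroring
 * the loop structure in copy_filename(). */
static unsigned long name_ext_blocks(unsigned long len)
{
	unsigned long blocks = 0;

	if (len <= FIRST_LEN)
		return 0;		/* fits inline in the dentry */
	len -= FIRST_LEN;
	while (len > FOLLOW_LEN) {
		blocks++;
		len -= FOLLOW_LEN;
	}
	return blocks + 1;		/* the final, partially filled block */
}

int main(void)
{
	const char *names[] = {
		"short",
		"a-name-that-is-clearly-longer-than-the-inline-area-of-a-dentry"
	};
	unsigned long i;

	for (i = 0; i < 2; i++)
		printf("\"%s\" -> %lu extension block(s)\n", names[i],
		       name_ext_blocks(strlen(names[i])));
	return 0;
}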
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR
CVE: NA
--------------------------------------
The nv dict is a hash table; implement add, delete, find and other related interfaces for it.
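For reference, an entry is mapped to one of the NV_DICT_CAPACITY bucket chains (512, per the "8B * 512 = 4KB" comment in dht.h) by INDEX(h), and add/find/delete then walk that chain. The user-space sketch below shows only the bucket selection; toy_hash() is an illustrative stand-in, not EulerFS's real filename hash.

#include <stdio.h>
#include <stdint.h>

#define NV_DICT_CAPACITY 512UL
#define INDEX(h) ((h) & (NV_DICT_CAPACITY - 1))

/* Illustrative stand-in for the filename hash; EulerFS's real hash lives in
 * its filename/hash helpers and is not reproduced here. */
static uint64_t toy_hash(const char *name)
{
	uint64_t h = 14695981039346656037ULL;	/* FNV-1a 64-bit */

	while (*name) {
		h ^= (unsigned char)*name++;
		h *= 1099511628211ULL;
	}
	return h;
}

int main(void)
{
	const char *names[] = { "file.txt", "dir", "a-longer-entry-name" };
	int i;

	for (i = 0; i < 3; i++)
		printf("%-20s -> bucket %lu\n", names[i],
		       (unsigned long)INDEX(toy_hash(names[i])));
	return 0;
}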
Signed-off-by: Mingkai Dong dongmingkai1@huawei.com
Signed-off-by: Hou Tao houtao1@huawei.com
Signed-off-by: Zhikang Zhang zhangzhikang1@huawei.com
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Hou Tao houtao1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 fs/eulerfs/dht.c | 312 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/eulerfs/dht.h | 156 ++++++++++++++++++++++++
 2 files changed, 468 insertions(+)
 create mode 100644 fs/eulerfs/dht.c
 create mode 100644 fs/eulerfs/dht.h
diff --git a/fs/eulerfs/dht.c b/fs/eulerfs/dht.c new file mode 100644 index 000000000000..f1b42cf7409b --- /dev/null +++ b/fs/eulerfs/dht.c @@ -0,0 +1,312 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/slab.h> +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/ratelimit.h> +#include <asm/cmpxchg.h> +#include "euler.h" +#include "dht.h" +#include "dep.h" + +#define GET_CRASH_VER(ptr) (((unsigned long)ptr) >> 56) + +void *fix_table(struct super_block *sb, struct nv_dict *dict, u32 idx) +{ + struct nv_dict_entry *he; + void *real_head; + u64 head_off; + + head_off = le64_to_cpu(dict->table[idx]); + if (!head_off) + return NULL; + if (head_off == EUFS_DIR_EOC) + return NULL; + real_head = o2p(sb, DICT_HEAD_REAL_OFF(head_off)); + + if (likely(GET_CRASH_VER(head_off) == EUFS_SB(sb)->s_crash_ver)) { + /* No need fix */ + return real_head; + } + he = real_head; + BUG_ON(he == EUFS_DIR_EOC_PTR); + while (he && he != EUFS_DIR_EOC_PTR) { + if (he->volatile_next) { + he->volatile_next = NULL_VAL; + eufs_flush_cacheline(he); + } + he = s2p(sb, he->next); + } + dict->table[idx] = COMPOSE_DICT_HEAD_le64(sb, real_head); + eufs_flush_cacheline(&dict->table[idx]); + eufs_pbarrier(); + return real_head; +} + +/* + * Insert to nv_dict using spinlocks. + * NOTICE: No resizing supported yet! + * Previous dentry is always the pointer + */ +struct nv_dict_entry *nv_dict_add(struct inode *dir, u64 **nv_header, u64 h, + const char *key, struct eufs_inode *pi) +{ + struct super_block *sb = dir->i_sb; + struct nv_dict __pmem *dict = + o2p(sb, eufs_iread_dict(EUFS_FRESH_PI(EUFS_PI(dir)))); + struct v_dict *volatile_dict = EUFS_I(dir)->i_volatile_dict; + u32 idx; + long err; + struct nv_dict_entry __pmem *de; + + idx = INDEX(h); + NV_ASSERT(dict); + NV_ASSERT(volatile_dict); + + /* NOTICE: simplified version w/o resizing */ + de = eufs_malloc_dentry(sb); + if (!de) + return ERR_PTR(-ENOSPC); + err = copy_filename(sb, de, h, key); + if (IS_ERR_VALUE(err)) { + nv_free(sb, de); + return ERR_PTR(-ENOSPC); + } + WARN_ON(!EUFS_IS_HEAD_PI(pi)); + de->inode = p2s(sb, pi); + + de->next = p2s(sb, + (volatile_dict->table[idx]) ? + (volatile_dict->table[idx] == EUFS_DIR_EOC_PTR ? + NULL : + volatile_dict->table[idx]) : + fix_table(sb, dict, idx)); + eufs_dentry_set_not_persist_flag(de); + + PRINT_DENTRY(de, "new dentry: "); + PRINT_PINODE(de->inode, "inode within dentry: "); + + *nv_header = &dict->table[idx]; + + /* Lock the header. It's to be released right after dep is locked. */ + inode_header_lock(dir); + volatile_dict->table[idx] = de; + + return de; +} + +/* + * Find from nv_dict with the protection of spinlock. + * No resizing support yet! 
+ */ +struct nv_dict_entry *nv_dict_find(struct inode *dir, hashlen_t h, + const char *key) +{ + struct super_block *sb = dir->i_sb; + struct nv_dict __pmem *dict = + o2p(sb, eufs_iread_dict(EUFS_FRESH_PI(EUFS_PI(dir)))); + struct v_dict *volatile_dict = EUFS_I(dir)->i_volatile_dict; + + struct nv_dict_entry *he; + unsigned int idx; + + idx = INDEX(h); + + /* + * volatile_dict->table[idx] can be EOC after + * all entries have been deleted + */ + if (volatile_dict && volatile_dict->table[idx]) + he = volatile_dict->table[idx]; + else + he = fix_table(sb, dict, idx); + while (he && he != EUFS_DIR_EOC_PTR) { + __le64 vnext; + + if (key_equals(sb, key, h, he)) + break; + vnext = eufs_dentry_vnext(he); + he = s2p(sb, vnext ? vnext : he->next); + } + + if (he == EUFS_DIR_EOC_PTR) + he = NULL; + return he; +} + +/* + * Delete from nv_dict w/ spinlocks. + * No resizing support yet! + * Previous dentry is also returned + */ +struct nv_dict_entry *nv_dict_delete(struct inode *dir, + struct nv_dict_entry **prevde, + u64 **nv_header, hashlen_t h, + const char *key) +{ + struct super_block *sb = dir->i_sb; + struct nv_dict __pmem *dict = + o2p(sb, eufs_iread_dict(EUFS_FRESH_PI(EUFS_PI(dir)))); + struct eufs_inode_info *dir_vi = EUFS_I(dir); + struct v_dict *volatile_dict = dir_vi->i_volatile_dict; + + struct nv_dict_entry *he; + struct nv_dict_entry *prev = NULL; + unsigned int idx; + __le64 vnext; + + NV_ASSERT(dict); + NV_ASSERT(volatile_dict); + + idx = INDEX(h); + + he = volatile_dict->table[idx] ? volatile_dict->table[idx] : + fix_table(sb, dict, idx); + + while (he && he != EUFS_DIR_EOC_PTR) { + if (key_equals(sb, key, h, he)) + break; + prev = he; + vnext = eufs_dentry_vnext(he); + /* EOC is not NULL, so it's okay. */ + he = s2p(sb, vnext ? vnext : he->next); + } + + if (he && he != EUFS_DIR_EOC_PTR) { + /* Lock the header. It's to be released right after dep is locked. */ + inode_header_lock(dir); + + vnext = eufs_dentry_vnext(he); + if (!prev) { + /* + * the first dentry (head of the chain). + * If the target is the end of chain, it is the only + * dentry in the chain, then either its volatile_next + * is EOC, or its next is NULL. + */ + volatile_dict->table[idx] = + s2p(sb, vnext ? vnext : he->next); + if (volatile_dict->table[idx] == NULL) + volatile_dict->table[idx] = EUFS_DIR_EOC_PTR; + } else { + bool persist_prev = !eufs_dentry_is_not_persist(prev); + + if (!persist_prev) { + /* + * Protect against the persistence of prev dentry + * by background persister. + */ + spin_lock(&dir_vi->i_dentry_persist_lock); + + persist_prev = + !eufs_dentry_is_not_persist(prev); + if (!persist_prev) { + /* + * Prev is a newly created dentry, + * Keep the property, + * Two pointers are updated together, + * no need to worry about the EOC. + */ + prev->next = + vnext ? (vnext == EUFS_DIR_EOC ? + NULL_VAL : + vnext) : + he->next; + eufs_dentry_set_not_persist_flag(prev); + } + + spin_unlock(&dir_vi->i_dentry_persist_lock); + } + + if (persist_prev) + prev->volatile_next = + vnext ? vnext : + (he->next ? he->next : + EUFS_DIR_EOC); + } + + if (eufs_dentry_is_not_persist(he)) + he->volatile_next = EUFS_DIR_DELNEW; + + *prevde = prev; + /* table[idx] must have been fixed, so it's OK to return it. 
*/ + *nv_header = &dict->table[idx]; + } else if (he == EUFS_DIR_EOC_PTR) { + he = NULL; + } + + return he; +} + +void nv_dict_scan_via_ptr(struct inode *dir, u64 pos, + int (*fn)(void *privdata, + const struct nv_dict_entry *de), + void *privdata) +{ + struct super_block *sb = dir->i_sb; + struct nv_dict __pmem *dict = + o2p(sb, eufs_iread_dict(EUFS_FRESH_PI(EUFS_PI(dir)))); + struct v_dict *volatile_dict = EUFS_I(dir)->i_volatile_dict; + + const struct nv_dict_entry *de = 0; + u64 idx; + u64 i; + u64 skip; + struct dir_scan_data *data = (struct dir_scan_data *)privdata; + struct dir_context *ctx = data->ctx; + int err; + + if (ctx->pos == EUFS_DIR_DOTDOT) { + idx = 0; + skip = 0; + } else { + idx = CURSOR_IDX(pos); + skip = CURSOR_CNT(pos); + } + + /* Next to emit: the skip-th element in dict->table[idx] */ + while (idx < NV_DICT_CAPACITY) { + if (!de) { + eufs_ptr_fast_check(dict); + eufs_ptr_fast_check(dict->table); + de = (volatile_dict && volatile_dict->table[idx]) ? + volatile_dict->table[idx] : + fix_table(sb, dict, idx); + } + i = 0; + while (de && de != EUFS_DIR_EOC_PTR) { + __le64 vnext; + + /* current is the i-th de in list */ + /* skip de's remaining to skip */ + if (skip == 0) { + err = fn(privdata, de); + if (err) + return; + /* ctx->pos points to the next de */ + ctx->pos = CURSOR(idx, i + 1); + } else + skip--; + i++; + vnext = eufs_dentry_vnext(de); + de = s2p(sb, vnext ? vnext : de->next); + } + if (de == EUFS_DIR_EOC_PTR) + de = NULL; + idx++; /* next idx */ + skip = 0; + } + + ctx->pos = EUFS_DIR_EODIR; +} diff --git a/fs/eulerfs/dht.h b/fs/eulerfs/dht.h new file mode 100644 index 000000000000..bf67f6925685 --- /dev/null +++ b/fs/eulerfs/dht.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_DHT_H +#define EUFS_DHT_H + +#include <linux/atomic.h> +#include "filename.h" + +#define DICT_OK (0) +#define DICT_ERR (1) + +#define INDEX(h) (h & (NV_DICT_CAPACITY - 1)) + +#define CURSOR_IDX(csr) ((csr >> 32) & 0xffffffff) +#define CURSOR_CNT(csr) (csr & 0xffffffff) +#define CURSOR(idx, cnt) (((idx) << 32) | (cnt)) + +/* End of dir */ +#define EUFS_DIR_EODIR (CURSOR(NV_DICT_CAPACITY, 2)) +/* Offset of . in a dir */ +#define EUFS_DIR_DOT (CURSOR(NV_DICT_CAPACITY, 0)) +/* Offset of .. 
in a dir */ +#define EUFS_DIR_DOTDOT (CURSOR(NV_DICT_CAPACITY, 1)) + +#define EUFS_DENTRY_FLAG_NOT_PERSIST 1 +#define EUFS_DENTRY_FLAGS_MASK (~1ULL) + +/* The total size of the hash table (buckets) is 8B * 512 = 4KB */ + +struct v_dict { + struct nv_dict_entry *table[NV_DICT_CAPACITY]; +} __aligned(PAGE_SIZE); + +struct nv_dict_entry *nv_dict_add(struct inode *dir, u64 **nv_header, u64 h, + const char *key, struct eufs_inode *pi); + +struct nv_dict_entry *nv_dict_find(struct inode *dir, hashlen_t h, + const char *key); + +struct nv_dict_entry *nv_dict_delete(struct inode *dir, + struct nv_dict_entry **prevde, + u64 **nv_header, hashlen_t h, + const char *key); + +void nv_dict_scan_via_ptr(struct inode *dir, u64 pos, + int (*fn)(void *privdata, + const struct nv_dict_entry *de), + void *privdata); + +#define EUFS_PRINT_PI(pi, msg) \ + eufs_info( \ + msg \ + " pi=%px, pi->i_mode=%x, pi->i_nlink=%x, " \ + "pi->root=0x%llx, pi->i_size=0x%llx, pi->i_dotdot=0x%llx\n", \ + pi, pi ? eufs_iread_mode(pi) : 0, \ + pi ? eufs_iread_nlink(pi) : 0, pi ? eufs_iread_root(pi) : 0, \ + pi ? eufs_iread_size(pi) : 0, \ + pi ? eufs_iread_dotdot(pi) : 0) + +#define EUFS_PRINT_PI_INODE(msg, pi, inode) \ + eufs_info(msg " pi=%px inode=%px; " \ + "pi->i_mode=0%o inode->i_mode=0%o; " \ + "pi->i_nlink=0x%x inode=i_nlink=0x%x; " \ + "pi->root=0x%llx inode->root=%px; " \ + "pi->i_size=0x%llx inode->i_size=0x%llx; " \ + "pi->i_dotdot=0x%llx\n", \ + pi, inode, pi ? eufs_iread_mode(pi) : 0, \ + inode ? inode->i_mode : 0, pi ? eufs_iread_nlink(pi) : 0, \ + inode ? inode->i_nlink : 0, pi ? eufs_iread_root(pi) : 0, \ + inode ? EUFS_I(inode)->i_volatile_root : 0, \ + pi ? eufs_iread_size(pi) : 0, inode ? inode->i_size : 0, \ + pi ? pi->i_dotdot : 0) + +#define _PRINT_DENTRY(de, msg) \ + { \ + char *page; \ + if (HASHLEN_LEN(de->hv) > FIRST_LEN) { \ + page = eufs_alloc_name_copy( \ + de->name, HASHLEN_LEN(de->hv), de->nextname); \ + info(msg " de=%px, de->name=[%px]%*s, de->inode=%px, " \ + "de->next=%px, de->volatile_next=%px\n", \ + de, de->name, (int)HASHLEN_LEN(de->hv), page, \ + de->inode, de->next, de->volatile_next); \ + eufs_free_page(page); \ + } else { \ + info(msg " de=%px, de->name=[%px]%*s, de->inode=%px, " \ + "de->next=%px, de->volatile_next=%px\n", \ + de, de->name, (int)HASHLEN_LEN(de->hv), de->name, \ + de->inode, de->next, de->volatile_next); \ + } \ + } + +#define _PRINT_PINODE(pi, msg) EUFS_PRINT_PI(pi, msg) + +#define PRINT_DENTRY(de, msg) +#define PRINT_PINODE(pi, msg) + +void *fix_table(struct super_block *sb, struct nv_dict *dict, u32 idx); + +/* Rule for encoded pointers: + * encoding: o2s(encode(p2o())) + * decoding: o2p(decode(s2o())) + */ +#define COMPOSE_DICT_HEAD_le64(sb, head) \ + ((__le64)((void *)head == NULL ? 
\ + NULL_VAL : \ + (cpu_to_le64( \ + ((u64)(p2o(sb, head) & ((1UL << 56) - 1)) | \ + ((u64)EUFS_SB(sb)->s_crash_ver & 0xff) \ + << 56))))) + +#define DICT_HEAD_REAL_OFF(head_off) \ + ((u64)((u64)(head_off) & ((1UL << 56) - 1))) + +static inline __le64 eufs_dentry_vnext(const struct nv_dict_entry *entry) +{ + __le64 vnext = entry->volatile_next; + + if (vnext != EUFS_DIR_EOC) + vnext = vnext & cpu_to_le64(EUFS_DENTRY_FLAGS_MASK); + + return vnext; +} + +static inline bool +eufs_dentry_is_not_persist(const struct nv_dict_entry *entry) +{ + return (entry->volatile_next == + (entry->next | cpu_to_le64(EUFS_DENTRY_FLAG_NOT_PERSIST))); +} + +static inline void +eufs_dentry_clr_not_persist_flag(struct nv_dict_entry *entry) +{ + entry->volatile_next &= cpu_to_le64(EUFS_DENTRY_FLAGS_MASK); +} + +static inline void +eufs_dentry_set_not_persist_flag(struct nv_dict_entry *entry) +{ + entry->volatile_next = + entry->next | cpu_to_le64(EUFS_DENTRY_FLAG_NOT_PERSIST); +} + +#endif /* EUFS_DHT_H */
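A note on the bucket-head encoding above: fix_table() decides whether a bucket needs post-crash repair by comparing the crash version packed into the top byte of the stored head pointer (COMPOSE_DICT_HEAD_le64 / GET_CRASH_VER) against the current s_crash_ver. The standalone userspace sketch below models only that 56-bit-offset / 8-bit-version split; the helper names, main() and the sample values are illustrative, not taken from the patch.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define OFF_MASK ((1ULL << 56) - 1)

/* Pack a 56-bit NVM offset together with an 8-bit crash version. */
static uint64_t compose_dict_head(uint64_t off, uint8_t crash_ver)
{
        return (off & OFF_MASK) | ((uint64_t)crash_ver << 56);
}

static uint64_t dict_head_real_off(uint64_t head)
{
        return head & OFF_MASK;
}

static uint8_t dict_head_crash_ver(uint64_t head)
{
        return head >> 56;
}

int main(void)
{
        uint64_t off = 0x123456789abcULL;       /* hypothetical bucket offset */
        uint8_t cur_ver = 7;                    /* hypothetical s_crash_ver */
        uint64_t head = compose_dict_head(off, cur_ver);

        assert(dict_head_real_off(head) == off);
        assert(dict_head_crash_ver(head) == cur_ver);

        /* fix_table() only repairs volatile_next pointers when the stored
         * version differs from the current one, i.e. after a crash. */
        printf("bucket needs post-crash fix: %s\n",
               dict_head_crash_ver(head) == cur_ver ? "no" : "yes");
        return 0;
}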
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR CVE: NA
--------------------------------------
Operations like mkdir / rmdir create a dependency node. The node is inserted into the directory inode's i_dep_list and is handled later, when the directory is persisted by fsync or by a background persister.
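As a rough illustration of this flow, the sketch below is a minimal userspace model: a directory operation appends a node to the directory's dependency list, and a later persistence pass (fsync or a background persister in the real code) drains the list and applies each entry. The structures are simplified stand-ins, not the kernel ones.

#include <stdio.h>
#include <stdlib.h>

enum dep_type { DEP_DIRADD, DEP_DIRREM };

struct dep_node {
        enum dep_type type;
        const char *name;               /* name of the affected dentry */
        struct dep_node *next;
};

struct dir_state {
        struct dep_node *dep_list;      /* stand-in for vi->i_dep_list */
        struct dep_node **tail;
};

static void dep_insert(struct dir_state *dir, enum dep_type type,
                       const char *name)
{
        struct dep_node *dep = malloc(sizeof(*dep));

        if (!dep)
                exit(EXIT_FAILURE);
        dep->type = type;
        dep->name = name;
        dep->next = NULL;
        *dir->tail = dep;               /* append to the directory's list */
        dir->tail = &dep->next;
}

/* Stand-in for fsync_dir_oneshot(): drain the list and persist each change. */
static void persist_dir(struct dir_state *dir)
{
        struct dep_node *dep = dir->dep_list;

        while (dep) {
                struct dep_node *next = dep->next;

                printf("persist %s of '%s'\n",
                       dep->type == DEP_DIRADD ? "addition" : "removal",
                       dep->name);
                free(dep);
                dep = next;
        }
        dir->dep_list = NULL;
        dir->tail = &dir->dep_list;
}

int main(void)
{
        struct dir_state dir = { .dep_list = NULL, .tail = &dir.dep_list };

        dep_insert(&dir, DEP_DIRADD, "a");      /* e.g. mkdir a */
        dep_insert(&dir, DEP_DIRREM, "b");      /* e.g. rmdir b */
        persist_dir(&dir);                      /* later: fsync or persister */
        return 0;
}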
Signed-off-by: Mingkai Dong dongmingkai1@huawei.com Signed-off-by: Hou Tao houtao1@huawei.com Signed-off-by: Zhikang Zhang zhangzhikang1@huawei.com Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/eulerfs/dep.c | 791 +++++++++++++++++++++++++++++++++++++++++++++++ fs/eulerfs/dep.h | 218 +++++++++++++ 2 files changed, 1009 insertions(+) create mode 100644 fs/eulerfs/dep.c create mode 100644 fs/eulerfs/dep.h
diff --git a/fs/eulerfs/dep.c b/fs/eulerfs/dep.c new file mode 100644 index 000000000000..ec014bbf3700 --- /dev/null +++ b/fs/eulerfs/dep.c @@ -0,0 +1,791 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/percpu.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/ratelimit.h> +#include <linux/writeback.h> +#include "euler.h" +#include "dep.h" +#include "lock.h" +#include "dax.h" +#include "dht.h" + +static void do_dep_diradd_oneshot(struct inode *dir_inode, struct dep_node *dep, + u64 *bitset); + +struct flush_list_head { + int count; + struct llist_head head; +}; + +DEFINE_PER_CPU(struct flush_list_head, flush_list_percpu); + +#define IFMT_HAS_ROOT(ifmt) \ + ((ifmt) == S_IFREG || (ifmt) == S_IFDIR || (ifmt) == S_IFLNK) + +#define INODE_COND_TRYLOCK(inode, tag, enter_cond, exit_cond, exit_expr) \ + do { \ + tag: \ + if (enter_cond) { \ + if (likely(inode_trylock(inode))) { \ + /* get the lock, okay */ \ + } else { \ + if (exit_cond) { \ + exit_expr; \ + } else { \ + cond_resched(); \ + goto tag; \ + } \ + } \ + } \ + } while (0) + +static inline void fsync_dir_oneshot(struct inode *dir) +{ + eufs_dir_fsync_oneshot(dir); +} + +static void do_dep_dirrem(struct inode *inode, struct dep_node *dep, + u64 *bitset) +{ + struct nv_dict_entry *prevde = dep->prevde; + struct nv_dict_entry *de = dep->de; + int idx; + + eufs_dbg("!! %s !!", __func__); + NV_ASSERT(de); + NV_ASSERT(de->inode); + NV_ASSERT(de->name); + + idx = INDEX(de->hv); + bitset[idx / 64] = bitset[idx / 64] | (0x1ull << (idx & 63)); + eufs_dbg("bitset-add: dict=%llx, %d %llx\n", + eufs_iread_dict(EUFS_PI(inode)), idx, bitset[idx / 64]); + + /* + * This is a removal of a newly created dentry, nothing to do, + * the prevde is already manipulated in dht.c + */ + if (de->volatile_next == EUFS_DIR_DELNEW) + return; + + /* + * If dentries immediately following the deleted dentry are + * also deleted, prevde->volatile_next will be modified again. + * So if we assign prevde->volatile_next to prevde->next, + * these deletion will be persisted prematurely. 
+ */ + if (prevde && !eufs_dentry_is_not_persist(prevde)) { + prevde->next = de->next; + persist_dentry(prevde); + } +} + +static void do_dep_dirrem_reclaim(struct super_block *sb, struct dep_node *dep) +{ + struct nv_dict_entry *de = dep->de; + struct eufs_inode __maybe_unused *pi; + struct inode *child; + + pi = s2p(sb, de->inode); + child = dep->inode; + NV_ASSERT(EUFS_PI(child) == pi); + eufs_dbg("dirrem: child_inode=%px\n", child); + BUG_ON(!child); + eufs_free_name(sb, de); + nv_free(sb, de); +} + +#define EUFS_PRINT_BITSET(lvl, bitset) \ + eufs_##lvl("bitsets: %llx %llx %llx %llx %llx %llx %llx %llx\n", \ + bitset[0], bitset[1], bitset[2], bitset[3], bitset[4], \ + bitset[5], bitset[6], bitset[7]) + +static void eufs_sync_buckets(struct eufs_inode_info *vi, u64 bitset[8]) +{ + struct inode *inode = &vi->vfs_inode; + struct super_block *sb = inode->i_sb; + struct eufs_inode *pi = EUFS_FRESH_PI(EUFS_PI(inode)); + struct nv_dict *dict; + int i; + + /* Volatile buckets */ + if (!vi->i_volatile_dict) + return; + + EUFS_PRINT_BITSET(dbg, bitset); + + BUG_ON(!inode_is_header_locked(inode)); + dict = o2p(sb, eufs_iread_dict(pi)); + for (i = 0; i < 8; ++i) { + int j; + bool dirty; + int idx; + + if (!bitset[i]) + continue; + dirty = false; + for (j = 0; j <= 64; ++j) { + if (j % 8 == 0 && dirty) { + dirty = false; + eufs_flush_cacheline(&dict->table[idx]); + } + if (j == 64) + break; + if (!(bitset[i] & (0x1ull << j))) + continue; + idx = i * 64 + j; + eufs_dbg_dir("handle index %d (i %d, j %d) of inode=%px\n", + idx, i, j, inode); + + eufs_dbg_dir(" idx=%d dict[idx]=%px vdict[idx]=%px\n", + idx, dict->table[idx], + vi->i_volatile_dict->table[idx]); + + if (unlikely(vi->i_volatile_dict->table[idx] == + EUFS_DIR_EOC_PTR)) + dict->table[idx] = NULL_VAL; + else if (vi->i_volatile_dict->table[idx] != NULL) + dict->table[idx] = COMPOSE_DICT_HEAD_le64( + sb, vi->i_volatile_dict->table[idx]); + vi->i_volatile_dict->table[idx] = NULL; + dirty = true; + } + } +} + +/* + * Some ideas on fast fsync (of dir): + * + * 1. Batch and coalescence. The newly inserted dentry should be marked and + * during its removal, it should be marked again so that unnecessary dep_diradd + * an be prevented. + * + * 2. Split! The lock (only when there is one lock needed) can be temporarily + * given up so between handling two deps. This requires that the dentry pointed + * by dir_pi should not be reclaimed (like in RCU). Well, actually, combined + * with the following one idea, this is quite acceptable. + * + * 3. Delayed free. The removal operations can be delayed until the locks are + * released. + * + * + * Parallel fsync for a vi is not throughly considered though. + * + * 4. Detach only if the list is empty? + */ +static void fsync_rename_inode(struct inode *dir) +{ + struct eufs_inode_info *vi = EUFS_I(dir); + + if (!vi->i_is_dirty) + return; + + /* I'm holding the lock, so if it's dirty, it's dirty. */ + fsync_dir_oneshot(dir); +} + +void fsync_rename_inodes(struct inode *old_dir, struct inode *new_dir, + struct inode **locked_inodes) +{ + int i; + struct inode *inode; + + /* + * The two parent dirs, might have parent-child relations sometime + * before. So we need to transfer these two dirs too. 
+ */ + for (i = 0; i < EUFS_INODE_CNT_IN_RENAME; i++) { + inode = locked_inodes[i]; + if (inode) + eufs_inode_mark_lock_transferable(inode); + } + + if (old_dir == new_dir) { + fsync_rename_inode(old_dir); + } else { + fsync_rename_inode(old_dir); + fsync_rename_inode(new_dir); + } + + for (i = 0; i < EUFS_INODE_CNT_IN_RENAME; i++) { + inode = locked_inodes[i]; + if (inode) + eufs_inode_wait_lock_transfer_done(inode); + } +} + +static void eufs_update_persisted_seq(struct eufs_inode_info *vi, + struct list_head *head) +{ + if (!list_empty(head)) { + struct dep_node *dep = + list_last_entry(head, struct dep_node, node); + + vi->i_persisted_dep_seq = dep->seq; + } +} + +static int fsync_dir_bg(struct inode *dir) +{ + struct dep_node *dep, *next; + LIST_HEAD(detached_list); + LIST_HEAD(dump_list); + int i; +#define FSYNC_DIR_VI_LOOP_NUM (20) + + struct eufs_inode_info *vi = EUFS_I(dir); + struct super_block *sb = dir->i_sb; + struct eufs_sb_info *sbi = EUFS_SB(sb); + struct eufs_inode *pi = EUFS_PI(dir); + u64 bitset[8] = { 0 }; + int dep_count = 0; + +retry: + inode_urgent_lock(dir); + + /* Phase 1 */ + for (i = FSYNC_DIR_VI_LOOP_NUM; i >= 0; --i) { + /* Get all deps round by round */ + if (i == 0) { + /* Last round */ + inode_header_lock(dir); + } + inode_dep_lock(dir); + + if (list_empty(&vi->i_dep_list) && i > 0) { + /* Skip to last round */ + i = 1; + } + list_cut_position(&detached_list, &vi->i_dep_list, + vi->i_dep_list.prev); + + if (i > 0) + inode_dep_unlock(dir); + + /* Do dep one by one. */ + list_for_each_entry_safe(dep, next, &detached_list, node) { + if (dep->type == DEP_DIRADD) { + /* + * FIXME: the lockset might be different since + * we might have released the inode lock. + */ + do_dep_diradd_oneshot(dir, dep, bitset); + + } else if (dep->type == DEP_DIRREM) { + do_dep_dirrem(dir, dep, bitset); + + } else + BUG(); + } + + list_splice_tail_init(&detached_list, &dump_list); + + if (i == 0) { + eufs_pbarrier(); + + if (!list_empty(&dump_list)) + /* Phase 2 */ + eufs_sync_buckets(vi, bitset); + + inode_dep_unlock(dir); + inode_header_unlock(dir); + break; + } + } + + inode_urgent_unlock(dir); + + /* Phase 3 */ + inode_lock(dir); + + if (!list_empty(&vi->i_dep_list)) { + inode_unlock(dir); + /* To handle new deps between phase 2 & 3 */ + /* FIXME: Live lock possible! */ + goto retry; + } + + if (dir->i_nlink) + eufs_sync_pinode(dir, pi, false); + + eufs_update_persisted_seq(vi, &dump_list); + + vi->i_is_persisting = false; + vi->i_is_dirty = false; + + if (dir->i_nlink) + persist_pinode(pi); + + inode_unlock(dir); + + eufs_pbarrier(); + + /* Reclaim memory and clear the list */ + list_for_each_entry_safe(dep, next, &dump_list, node) { + struct inode *child_inode = dep->inode; + struct eufs_inode_info *child_vi = EUFS_I(child_inode); + + if (dep->type == DEP_DIRREM) + do_dep_dirrem_reclaim(sb, dep); + + /* remove from owner list */ + spin_lock(&child_vi->i_owner_lock); + list_del_init(&dep->owner_node); + spin_unlock(&child_vi->i_owner_lock); + + iput(child_inode); + + list_del(&dep->node); + + eufs_free_dep_node(dep); + dep_count++; + } + atomic_sub(dep_count, &sbi->s_nr_dep_nodes); + eufs_dbg("@cpu=%d !! fsync dir vi done: inode=%px\n", + smp_processor_id(), &vi->vfs_inode); + return 0; +} + +static int fsync_nondir_oneshot(struct inode *inode) +{ + struct eufs_inode_info *vi = EUFS_I(inode); + struct eufs_inode *pi; + + /* For files other than dir */ + WARN(S_ISDIR(inode->i_mode), "%s on a dir!", __func__); + + /* Inode needs to remove. 
Nothing to do */ + if (!inode->i_nlink) { + vi->i_is_dirty = false; + return 0; + } + + pi = EUFS_PI(inode); + + eufs_sync_pinode(inode, pi, false); + + persist_pinode(pi); + + vi->i_is_dirty = false; + + return 0; +} + +static int fsync_nondir_bg(struct inode *inode) +{ + struct eufs_inode_info *vi = EUFS_I(inode); + int r; + + inode_lock(inode); + r = fsync_nondir_oneshot(inode); + vi->i_is_persisting = false; + inode_unlock(inode); + + return r; +} + +static void fsync_bg(struct inode *inode) +{ + struct eufs_sb_info *sbi = EUFS_SB(inode->i_sb); + + wait_on_inode(inode); + + /* Reading i_mode may need no protection */ + if (S_ISDIR(inode->i_mode)) + fsync_dir_bg(inode); + else + fsync_nondir_bg(inode); + + /* Decrease */ + iput(inode); + + if (atomic_dec_and_test(&sbi->s_nr_dirty_inodes) && sbi->s_draining) { + /* end of draining */ + sbi->s_draining = false; + } +} + +void fsync_oneshot(struct inode *inode) +{ + /* Reading i_mode may need no protection */ + if (S_ISDIR(inode->i_mode)) + fsync_dir_oneshot(inode); + else + fsync_nondir_oneshot(inode); +} + +static void do_dep_diradd_oneshot(struct inode *dir_inode, struct dep_node *dep, + u64 *bitset) +{ + struct super_block *sb = dir_inode->i_sb; + struct nv_dict_entry *de = dep->de; + struct inode *inode = dep->inode; + struct eufs_inode_info *dir_vi = EUFS_I(dir_inode); + struct eufs_inode *pi; + struct eufs_inode *fresh_pi; + int idx; + void *buffer[16]; + struct alloc_batch ab; + bool lock_transferred = false; + + idx = INDEX(de->hv); + bitset[idx / 64] = bitset[idx / 64] | (0x1ull << (idx & 63)); + + if (de->volatile_next == EUFS_DIR_DELNEW) { + /* + * The de is already invisible from both the latest view and + * the consistent view. + * Will be handled in the corresponding dirrem. + */ + return; + } + + /* Meow? This equality is the sign of diradd */ + WARN(!eufs_dentry_is_not_persist(de), "diradd wrong sign"); + + pi = s2p(sb, de->inode); + + wait_on_inode(inode); +retry: + if (likely(inode_trylock(inode))) { + /* Got the lock */ + } else { + if (eufs_inode_mark_lock_transferring(inode)) { + lock_transferred = true; + } else { + cond_resched(); + goto retry; + } + } + + eufs_sync_pinode(inode, pi, false); + fresh_pi = EUFS_FRESH_PI(pi); + + if (!lock_transferred) + inode_unlock(inode); + else + eufs_inode_lock_transfer_done(inode); + + ab.n_used = 0; + ab.size = 16; + ab.batch = buffer; + + eufs_alloc_batch_add(sb, &ab, de); + /* + * force to persist the allocation without checking. 
+ * TODO: we should differentiate the link and create syscall to agree + * with checking + */ + eufs_alloc_persist(sb, pi, true); + + if (S_ISLNK(fresh_pi->i_mode)) { + void *root = o2p(sb, eufs_iread_root(fresh_pi)); + + /* reg file's root is done in btree */ + /* In case of Hard link, we must force the allocation persitence */ + eufs_alloc_persist(sb, root, true); + persist_symlink(root); + } else if (S_ISDIR(fresh_pi->i_mode)) { + void *root = o2p(sb, eufs_iread_root(fresh_pi)); + + eufs_alloc_persist(sb, root, false); + persist_page(root); + } + + persist_name(sb, de, &ab); + + eufs_alloc_batch_persist_reset(sb, &ab); + + persist_pinode(pi); + + spin_lock(&dir_vi->i_dentry_persist_lock); + eufs_dentry_clr_not_persist_flag(de); + spin_unlock(&dir_vi->i_dentry_persist_lock); + + persist_dentry(de); +} + +void eufs_dir_fsync_oneshot(struct inode *dir) +{ + struct dep_node *dep; + struct dep_node *next; + struct super_block *sb = dir->i_sb; + struct eufs_sb_info *sbi = EUFS_SB(sb); + struct eufs_inode_info *vi = EUFS_I(dir); + LIST_HEAD(detached_list); + u64 bitset[8] = { 0 }; + int dep_count = 0; + + BUG_ON(!inode_is_locked(dir)); + + inode_urgent_lock(dir); + + /* get all deps */ + inode_header_lock(dir); + inode_dep_lock(dir); + + if (list_empty(&vi->i_dep_list)) + goto unlock_sync_pinode; + + list_for_each_entry(dep, &vi->i_dep_list, node) { + if (dep->type == DEP_DIRADD) + do_dep_diradd_oneshot(dir, dep, bitset); + else if (dep->type == DEP_DIRREM) + do_dep_dirrem(dir, dep, bitset); + else + BUG(); + } + + list_splice_init(&vi->i_dep_list, &detached_list); + + /* sync buckets */ + eufs_pbarrier(); + eufs_sync_buckets(vi, bitset); + +unlock_sync_pinode: + inode_dep_unlock(dir); + inode_header_unlock(dir); + + /* sync pinode */ + if (dir->i_nlink) + eufs_sync_pinode(dir, EUFS_PI(dir), false); + + eufs_pbarrier(); + + eufs_update_persisted_seq(vi, &detached_list); + + vi->i_is_dirty = false; + + /* Reclaim memory and clear the list */ + list_for_each_entry_safe(dep, next, &detached_list, node) { + struct inode *child_inode = dep->inode; + struct eufs_inode_info *child_vinode = EUFS_I(child_inode); + + spin_lock(&child_vinode->i_owner_lock); + list_del_init(&dep->owner_node); + spin_unlock(&child_vinode->i_owner_lock); + + if (dep->type == DEP_DIRREM) { + do_dep_dirrem_reclaim(sb, dep); + iput(dep->inode); + } else if (dep->type == DEP_DIRADD) { + iput(dep->inode); + } + list_del(&dep->node); + eufs_free_dep_node(dep); + dep_count++; + } + atomic_sub(dep_count, &sbi->s_nr_dep_nodes); + + inode_urgent_unlock(dir); +} + +void fsync_on_draining(struct inode *dir, struct inode *inode) +{ + BUG_ON(!dir); + BUG_ON(!inode_is_locked(dir)); + BUG_ON(inode && !inode_is_locked(inode)); + + /* for link/unlink/rmdir */ + if (inode) + eufs_inode_mark_lock_transferable(inode); + + fsync_dir_oneshot(dir); + + if (inode) + eufs_inode_wait_lock_transfer_done(inode); +} + +#define NR_FLUSH_EACH_ROUND (16) +#define FLUSH_START_THRESHOLD (64) + +static __always_inline int handle_persistees_for_each_cpu( + struct super_block *sb, const struct cpumask *mask, int idx) { + struct eufs_sb_info *sbi = EUFS_SB(sb); + struct llist_node *list; + struct llist_head *head; + struct eufs_inode_info *vi; + struct eufs_inode_info *next; + int n_active_list; + int cpu; + bool need; + +retry: + need = sbi->need_sync[idx]; + n_active_list = 0; + for_each_cpu(cpu, mask) { + head = per_cpu_ptr(sbi->persistee_list, cpu); + + if (unlikely(llist_empty(head))) + continue; + + n_active_list++; + + list = llist_del_all(head); + + 
eufs_dbg("persister get list %px for cpu%d\n", list, cpu); + + /* reverse the ordering for better locality? */ + llist_for_each_entry_safe(vi, next, list, i_persistee_node) + fsync_bg(&vi->vfs_inode); + eufs_dbg("persister handled list %px\n", list); + } + /** + * We need a complete round of run for fssync. If + * need != sbi->need_sync[idx], need_sync was modified during our last + * round. We need to retry to ensure a complete round of run. + * It's okay if dirty inodes of a cpu is still being processed by + * another persister, since we will wait for all persisters to finish + * for fssync. + */ + if (need != READ_ONCE(sbi->need_sync[idx])) + goto retry; + if (need) { + sbi->need_sync[idx] = false; + wake_up(&sbi->sync_wq); + } + if (READ_ONCE(sbi->need_sync[idx])) + goto retry; + + return n_active_list; +} + +static int persister(void *data) +{ + struct super_block *sb = data; + struct eufs_sb_info *sbi = EUFS_SB(sb); + const struct cpumask *mask = cpumask_of_node(numa_node_id()); + const int period = + (persist_period == 0) ? /* default */ (HZ / 4) : + /* less than a second */ + ((persist_period < 0) ? (HZ / (-persist_period)) : + /* more than a second */ + (HZ * persist_period)); + int idx = 0; + int num_persisters = num_sockets * persisters_per_socket; + + eufs_info("sb=%px cpu=%d cpumask=%*pbl period=%d\n", data, + smp_processor_id(), cpumask_pr_args(mask), period); + + while (idx < num_persisters && sbi->persisters[idx] != current) + idx++; + BUG_ON(idx >= num_persisters); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(period); + handle_persistees_for_each_cpu(sb, mask, idx); + } + + while (handle_persistees_for_each_cpu(sb, mask, idx)) + cpu_relax(); + + eufs_info("finalizing on %d\n", smp_processor_id()); + + return 0; +} + +int dep_init(struct super_block *sb) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + int cpu; + int i, j; + char name[BDEVNAME_SIZE]; + int err; + + sbi->persistee_list = alloc_percpu(struct llist_head); + if (!sbi->persistee_list) { + err = -ENOMEM; + goto cleanup; + } + + /* init each llist */ + for_each_possible_cpu(cpu) + init_llist_head(per_cpu_ptr(sbi->persistee_list, cpu)); + + sbi->persisters = kmalloc(sizeof(struct task_struct *) * + persisters_per_socket * num_sockets, + GFP_KERNEL); + if (!sbi->persisters) { + err = -ENOMEM; + goto cleanup; + } + + sbi->need_sync = kzalloc( + sizeof(bool) * persisters_per_socket * num_sockets, GFP_KERNEL); + if (!sbi->need_sync) { + err = -ENOMEM; + goto cleanup; + } + + init_waitqueue_head(&sbi->sync_wq); + + bdevname(sb->s_bdev, name); + for (i = 0; i < num_sockets; ++i) { + for (j = 0; j < persisters_per_socket; ++j) { + int idx = i * persisters_per_socket + j; + + sbi->persisters[idx] = kthread_create_on_node( + persister, sb, i, "hmfs/%s-%d.%d", name, i, j); + + if (IS_ERR(sbi->persisters[idx])) { + err = PTR_ERR(sbi->persisters[idx]); + pr_err("create persister %s-%d.%d error %d", + name, i, j, err); + sbi->persisters[idx] = NULL; + goto cleanup; + } + + set_cpus_allowed_ptr(sbi->persisters[idx], + cpumask_of_node(i)); + + wake_up_process(sbi->persisters[idx]); + } + } + + return 0; + +cleanup: + dep_fini(sb); + return err; +} + +void dep_fini(struct super_block *sb) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + + if (sbi->persisters) { + int i; + + for (i = 0; i < persisters_per_socket * num_sockets; ++i) { + if (sbi->persisters[i]) { + kthread_stop(sbi->persisters[i]); + sbi->persisters[i] = NULL; + } + } + + kfree(sbi->persisters); + sbi->persisters = 
NULL; + } + + kfree(sbi->need_sync); + sbi->need_sync = NULL; + + free_percpu(sbi->persistee_list); + sbi->persistee_list = NULL; +} diff --git a/fs/eulerfs/dep.h b/fs/eulerfs/dep.h new file mode 100644 index 000000000000..16657f3cf6ce --- /dev/null +++ b/fs/eulerfs/dep.h @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_DEP_H +#define EUFS_DEP_H + +#include <linux/llist.h> +#include <linux/list.h> +#include <linux/fs.h> +#include "euler.h" +#include "alloc_interface.h" + +/** + * Dep type: + * - diradd (for create/symlink/link/mknod) + * - dirrem + */ + +enum fsync_type { + FSYNC_DEP, + FSYNC_RENAME, + FSYNC_SYSCALL, +}; + +extern int disable_persisters; +extern int persist_period; +extern int persisters_per_socket; + +#define eufs_dep_seq_after(a, b) ((s32)((b) - (a)) < 0) +#define eufs_dep_seq_after_eq(a, b) ((s32)((a) - (b)) >= 0) + +void eufs_dir_fsync_oneshot(struct inode *dir); +void fsync_on_draining(struct inode *dir, struct inode *inode); + +void fsync_rename_inodes(struct inode *old_inode, struct inode *new_inode, + struct inode **locked_inodes); + +void fsync_oneshot(struct inode *inode); + +enum dep_type { + DEP_DIRADD, /* Hard link is detected by checking inode->i_nlink */ + DEP_DIRREM, + DEP_TYPE_COUNT, + +}; + +struct dep_node { + struct list_head node; + struct list_head owner_node; + u32 seq; + /* Type of the dependency */ + enum dep_type type; + /* Previous dentry */ + struct nv_dict_entry *prevde; + /* header of the list */ + u64 *nv_header; + /* Related Dentry, which also points to an inode */ + struct nv_dict_entry __pmem *de; + /* inode for de->pi */ + struct inode *inode; + struct inode *dir; +} __aligned(CACHELINE_SIZE); + +int dep_init(struct super_block *sb); +void dep_fini(struct super_block *sb); + +static __always_inline void request_persistence(struct inode *inode) +{ + struct eufs_sb_info *sbi = EUFS_SB(inode->i_sb); + struct eufs_inode_info *vi = EUFS_I(inode); + int cpu; + + BUG_ON(!inode_is_locked(inode)); + + if (!vi->i_is_dirty) + vi->i_is_dirty = true; + + if (vi->i_is_persisting) + return; + + cpu = get_cpu(); + llist_add(&vi->i_persistee_node, per_cpu_ptr(sbi->persistee_list, cpu)); + put_cpu(); + + eufs_dbg_vlimit("sbi->s_nr_dirty_inodes=%d ++ vi=%px @cpu=%d\n", + atomic_read(&sbi->s_nr_dirty_inodes), vi, cpu); + + if (atomic_inc_return(&sbi->s_nr_dirty_inodes) > max_dirty_inodes && + !sbi->s_draining) + sbi->s_draining = true; + + vi->i_is_persisting = true; + ihold(inode); +} + +/* precondition: dir inode is mutex-locked */ +static __always_inline void dep_insert(struct inode *dir, struct dep_node *dep) +{ + struct eufs_inode_info *dir_vi = EUFS_I(dir); + struct eufs_inode_info *child_vi = EUFS_I(dep->inode); + struct eufs_sb_info *sbi = EUFS_SB(dir->i_sb); + + inode_dep_lock(dir); + inode_header_unlock(dir); + list_add_tail(&dep->node, &dir_vi->i_dep_list); + spin_lock(&child_vi->i_owner_lock); + list_add_tail(&dep->owner_node, &child_vi->i_owner_list); + 
spin_unlock(&child_vi->i_owner_lock); + inode_dep_unlock(dir); + + eufs_dbg_vlimit("sbi->s_nr_dep_nodes=%d ++\n", + atomic_read(&sbi->s_nr_dep_nodes)); + if (atomic_inc_return(&sbi->s_nr_dep_nodes) > max_dep_nodes && + !sbi->s_draining) { + sbi->s_draining = true; + } + + /* Request a persistence */ + request_persistence(dir); +} + +static __always_inline bool eufs_valid_inode_in_de(struct nv_dict_entry *de, + struct inode *inode) +{ + return (le64_to_cpu(de->inode) == inode->i_ino); +} + +static __always_inline void +dep_new_insert(struct dep_node *dep, struct inode *dir, enum dep_type type, + struct nv_dict_entry *prevde, u64 *nv_header, + struct nv_dict_entry *de, struct inode *inode, u32 seq) +{ + dep->type = type; + dep->prevde = prevde; + dep->nv_header = nv_header; + dep->de = de; + dep->inode = inode; + dep->dir = dir; + dep->seq = seq; + NV_ASSERT(eufs_valid_inode_in_de(dep->de, dep->inode)); + ihold(dep->inode); + dep_insert(dir, dep); +} + +static __always_inline void persist_dentry(struct nv_dict_entry *de) +{ + NV_ASSERT(de); + NV_ASSERT((u64)de % CACHELINE_SIZE == 0); + NV_ASSERT(sizeof(de) <= CACHELINE_SIZE); + eufs_flush_cacheline(de); +} + +static __always_inline void persist_pinode(struct eufs_inode *pi) +{ + WARN_ON(!EUFS_IS_HEAD_PI(pi)); + NV_ASSERT(pi); + NV_ASSERT((u64)pi % CACHELINE_SIZE == 0); + NV_ASSERT(sizeof(pi) <= EUFS_INODE_SIZE); + eufs_flush_cacheline(EUFS_FRESH_PI(pi)); + eufs_flush_cacheline(&EUFS_FRESH_PI(pi)->i_fresh); +} + +static __always_inline void persist_name(struct super_block *sb, + const struct nv_dict_entry *de, + struct alloc_batch *ab) +{ + size_t len = HASHLEN_LEN(de->hv); + struct nv_name_ext *next; + const char *name; + + if (likely(len <= FIRST_LEN)) { + /* embedded in de */ + return; + } + next = s2p(sb, de->nextname); + len -= FIRST_LEN; + name = next->name; + eufs_alloc_batch_add(sb, ab, (void *)name); + while (len > FOLLOW_LEN) { + next = s2p(sb, next->nextname); + eufs_flush_cacheline(name); + len -= FOLLOW_LEN; + name = next->name; + eufs_alloc_batch_add(sb, ab, (void *)name); + } + eufs_flush_cacheline(name); +} + +static __always_inline void persist_symlink(void *root) +{ + u64 len; + + NV_ASSERT(root); + NV_ASSERT(((u64)root) % PAGE_SIZE == 0); + len = EUFS_SYMLINK_HASHLEN_LEN(*((u64 *)root)); + NV_ASSERT(len <= EUFS_MAX_SYMLINK_LEN); + BUG_ON(len > EUFS_MAX_SYMLINK_LEN); + eufs_flush_range(root, EUFS_SYMLINK_SIZE(len)); +} + +static __always_inline void persist_page(const char *page) +{ + NV_ASSERT(page); + NV_ASSERT(((u64)page) % PAGE_SIZE == 0); + eufs_flush_page(page); +} + +#endif /* EUFS_DEP_H */
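A small aside on the sequence macros above: eufs_dep_seq_after() and eufs_dep_seq_after_eq() compare u32 sequence numbers via a signed difference, in the same spirit as the kernel's time_after(), so the comparison stays correct across counter wraparound. Below is a standalone userspace check of that property; the sample values are arbitrary.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* "a is after b", correct even when the 32-bit counter has wrapped. */
static int seq_after(uint32_t a, uint32_t b)
{
        return (int32_t)(b - a) < 0;
}

int main(void)
{
        assert(seq_after(10, 5));                /* plain case */
        assert(seq_after(3, UINT32_MAX - 2));    /* across the wrap */
        assert(!seq_after(UINT32_MAX - 2, 3));
        printf("wraparound-safe comparison holds\n");
        return 0;
}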
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR CVE: NA
--------------------------------------
Implement multiple inode-related interfaces, such as eufs_iget and eufs_put_inode.
Signed-off-by: Mingkai Dong dongmingkai1@huawei.com Signed-off-by: Hou Tao houtao1@huawei.com Signed-off-by: Zhikang Zhang zhangzhikang1@huawei.com Signed-off-by: sunqiuyang sunqiuyang@huawei.com Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/eulerfs/inode.c | 602 +++++++++++++++++++++++++++++++++++++++++++++ fs/eulerfs/inode.h | 44 ++++ 2 files changed, 646 insertions(+) create mode 100644 fs/eulerfs/inode.c create mode 100644 fs/eulerfs/inode.h
diff --git a/fs/eulerfs/inode.c b/fs/eulerfs/inode.c new file mode 100644 index 000000000000..c3db0750b66f --- /dev/null +++ b/fs/eulerfs/inode.c @@ -0,0 +1,602 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/fs.h> +#include <linux/aio.h> +#include <linux/sched.h> +#include <linux/highuid.h> +#include <linux/module.h> +#include <linux/mpage.h> +#include <linux/backing-dev.h> +#include <linux/types.h> +#include <linux/ratelimit.h> +#include <linux/dax.h> +#include "euler.h" +#include "dax.h" +#include "dht.h" +#include "dep.h" + +static int eufs_read_pinode(struct inode *inode, struct eufs_inode *pi) +{ + int ret = -EIO; + struct eufs_inode_info *vi = EUFS_I(inode); + struct super_block *sb = inode->i_sb; + umode_t mode; + u64 blocks; + u64 encoded_root; + + eufs_dbg("%s: inode=%px pi=%px, pi->i_mode=%x\n", __func__, inode, + pi, eufs_iread_mode(pi)); + + pi = EUFS_FRESH_PI(pi); + + eufs_set_inode_flags(inode, eufs_iread_flags(pi)); + mode = eufs_iread_mode(pi); + inode->i_mode = mode; + vi->i_version = eufs_iread_version(pi); + inode->i_ctime.tv_sec = eufs_iread_ctime(pi); + inode->i_ctime.tv_nsec = eufs_iread_ctime_nsec(pi); + i_uid_write(inode, eufs_iread_uid(pi)); + i_gid_write(inode, eufs_iread_gid(pi)); + vi->i_dotdot = eufs_iread_dotdot(pi); + vi->i_ext = eufs_iread_ext(pi); + + inode->i_generation = eufs_iread_generation(pi); + set_nlink(inode, eufs_iread_nlink(pi)); + inode->i_mtime.tv_sec = eufs_iread_mtime(pi); + inode->i_atime.tv_sec = eufs_iread_atime(pi); + inode->i_mtime.tv_nsec = eufs_iread_mtime_nsec(pi); + inode->i_atime.tv_nsec = eufs_iread_atime_nsec(pi); + + inode->i_size = eufs_iread_size(pi); + + blocks = 0; + switch (mode & S_IFMT) { + case S_IFDIR: + vi->i_dotdot = eufs_iread_dotdot(pi); + vi->i_volatile_root = NULL; + vi->i_volatile_height = 0; + blocks = 1; + break; + case S_IFREG: + vi->i_volatile_tree_blocks = eufs_iread_tree_blocks(pi); + eufs_alloc_batch_init(&vi->page_batch, 2); + fallthrough; + case S_IFLNK: + encoded_root = eufs_iread_root(pi); + vi->i_volatile_root = o2p(sb, root_ptr(encoded_root)); + vi->i_volatile_height = root_height(encoded_root); + + if (S_ISREG(mode)) + /* These blocks contain hole as well */ + blocks = vi->i_volatile_tree_blocks; + else + blocks = 1; + break; + case S_IFCHR: + case S_IFBLK: + inode->i_rdev = eufs_iread_rdev(pi); + break; + } + + /* check if the inode is active. 
*/ + if (inode->i_nlink == 0) { + /* this inode is deleted */ + ret = -ESTALE; + goto bad_inode; + } + + inode->i_blocks = blocks << (inode->i_blkbits - 9); + + inode->i_mapping->a_ops = &eufs_aops; + + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_op = &eufs_file_inode_operations; + inode->i_fop = &eufs_file_operations; + break; + case S_IFDIR: + inode->i_op = &eufs_dir_inode_operations; + inode->i_fop = &eufs_dir_operations; + break; + case S_IFLNK: + inode->i_op = &eufs_symlink_inode_operations; + break; + default: + inode->i_size = 0; + inode->i_op = &eufs_special_inode_operations; + init_special_inode(inode, inode->i_mode, eufs_iread_rdev(pi)); + break; + } + + return 0; + +bad_inode: + make_bad_inode(inode); + return ret; +} + +void eufs_sync_pinode(struct inode *inode, struct eufs_inode *pi, bool evict) +{ + struct eufs_inode_info *vi = EUFS_I(inode); + struct super_block *sb = inode->i_sb; + u64 pi_root_o; + u64 pi_tree_blocks; + struct eufs_inode __pmem *twin_pi = EUFS_TWIN_PI(pi); + bool new = false; + + BUG_ON(!pi); + BUG_ON(!inode); + BUG_ON(!evict && !inode_is_locked(inode)); + + if (!inode->i_nlink) + return; + + /* let pi be the latest pinode */ + if (!pi->i_fresh || !twin_pi->i_fresh) + new = true; + + if (pi->i_fresh < twin_pi->i_fresh || (new && (pi > twin_pi))) { + struct eufs_inode *t = pi; + + pi = twin_pi; + twin_pi = t; + } + + pi_root_o = eufs_iread_root(pi); + + pi_tree_blocks = eufs_iread_tree_blocks(pi); + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + break; + case S_IFREG: + BUG_ON(!evict && !inode_is_locked(inode)); + + if (vi->i_volatile_tree_blocks > pi_tree_blocks) { + /* For a newly created pi, this is always true */ + void __pmem *root = vi->i_volatile_root; + int height = vi->i_volatile_height; + + BUG_ON(root_height(pi_root_o) > vi->i_volatile_height); + + eufs_alloc_batch_persist_reset(sb, &vi->page_batch); + + eufs_persist_btree( + sb, root, height, pi_tree_blocks * PAGE_SIZE, + vi->i_volatile_tree_blocks * PAGE_SIZE); + + } else { + eufs_alloc_batch_persist_reset(sb, &vi->page_batch); + } + pi_root_o = encode_root(p2o(sb, vi->i_volatile_root), + vi->i_volatile_height); + pi_tree_blocks = vi->i_volatile_tree_blocks; + break; + case S_IFLNK: + /* Never change */ + break; + case S_IFCHR: + case S_IFBLK: + pi_root_o = ((u64)inode->i_rdev << 32) | inode->i_rdev; + break; + } + if (!evict && !inode_is_locked(inode)) { + eufs_info("! 
inode=%px\n", inode); + BUG(); + } + BUG_ON(!evict && !inode_is_locked(inode)); + + /* update to new data */ + eufs_iwrite_flags(twin_pi, eufs_get_inode_flags(inode, pi)); + eufs_iwrite_mode(twin_pi, inode->i_mode); + eufs_iwrite_version(twin_pi, 1); + eufs_iwrite_ctime(twin_pi, inode->i_ctime.tv_sec); + eufs_iwrite_ctime_nsec(twin_pi, inode->i_ctime.tv_nsec); + eufs_iwrite_uid(twin_pi, i_uid_read(inode)); + eufs_iwrite_gid(twin_pi, i_gid_read(inode)); + eufs_iwrite_dotdot(twin_pi, vi->i_dotdot); + eufs_iwrite_ext(twin_pi, vi->i_ext); /* no ext here */ + + eufs_iwrite_generation(twin_pi, inode->i_generation); + eufs_iwrite_nlink(twin_pi, inode->i_nlink); + eufs_iwrite_mtime(twin_pi, inode->i_mtime.tv_sec); + eufs_iwrite_atime(twin_pi, inode->i_atime.tv_sec); + eufs_iwrite_mtime_nsec(twin_pi, inode->i_mtime.tv_nsec); + eufs_iwrite_atime_nsec(twin_pi, inode->i_atime.tv_nsec); + eufs_iwrite_root(twin_pi, pi_root_o); + eufs_iwrite_size(twin_pi, inode->i_size); + eufs_iwrite_tree_blocks(twin_pi, pi_tree_blocks); + + eufs_flush_cacheline(twin_pi); + if (new) { + /* Handle new */ + pi->i_fresh = 1; + eufs_flush_cacheline(&pi->i_fresh); + twin_pi->i_fresh = 2; + } else if (unlikely(pi->i_fresh == U16_MAX)) { + /* Handle overflow */ + /* Invarient: pi should always be the freshest */ + /* freshness 0 is reserved for new inodes */ + twin_pi->i_fresh = 1; + eufs_flush_cacheline(&twin_pi->i_fresh); + pi->i_fresh = 2; + eufs_flush_cacheline(&pi->i_fresh); + twin_pi->i_fresh = 3; + } else { + /* Normal case */ + twin_pi->i_fresh = pi->i_fresh + 1; + } + /* This flush also flushes the bottom half of the twin_pi */ + eufs_flush_cacheline(&twin_pi->i_fresh); +} + +struct inode *eufs_iget(struct super_block *sb, struct eufs_inode *pi) +{ + struct inode *inode; + int err; + + WARN_ON(!EUFS_IS_HEAD_PI(pi)); + inode = iget_locked(sb, eufs_pi2ino(sb, pi)); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = eufs_read_pinode(inode, pi); + if (unlikely(err)) + goto fail; + + unlock_new_inode(inode); + return inode; +fail: + iget_failed(inode); + return ERR_PTR(err); +} + +void eufs_evict_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct eufs_inode *pi = EUFS_PI(inode); + struct eufs_inode *fresh_pi; + struct eufs_inode_info *vi = EUFS_I(inode); + + eufs_dbg( + "Evicting: inode=%px, pi=%px i_nlink=%u inode->i_size=%lld blocks=%lld\n", + inode, pi, inode->i_nlink, inode->i_size, + vi->i_volatile_tree_blocks); + + if (!inode->i_nlink && !is_bad_inode(inode)) { + /* Free the inode */ + fresh_pi = EUFS_FRESH_PI(pi); + + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + /* Directory can be removed only if the dict is empty */ + NV_ASSERT(!vi->i_volatile_root); + nv_free(sb, o2p(sb, eufs_iread_dict(fresh_pi))); + break; + case S_IFLNK: + NV_ASSERT(!vi->i_volatile_root); + nv_free(sb, o2p(sb, eufs_iread_root(fresh_pi))); + break; + case S_IFREG: + /* Traverse the B-tree! 
*/ + eufs_free_btree(sb, vi->i_volatile_root, + vi->i_volatile_height, + vi->i_volatile_tree_blocks); + break; + default: + break; + } + eufs_iwrite_nlink(fresh_pi, 0); + eufs_iwrite_mode(fresh_pi, 0); + eufs_flush_cacheline(fresh_pi); + WARN_ON(!EUFS_IS_HEAD_PI(pi)); + nv_free(sb, pi); + } else if (!is_bad_inode(inode)) { + eufs_sync_pinode(inode, pi, true); + } + if (!is_bad_inode(inode) && vi->i_volatile_dict) { + eufs_free_page(vi->i_volatile_dict); + vi->i_volatile_dict = NULL; + } + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + + return; +} + +int eufs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + inode_lock(inode); + eufs_sync_pinode(inode, EUFS_PI(inode), false); + inode_unlock(inode); + return 0; +} + +int eufs_notify_change(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct eufs_inode *pi = EUFS_PI(inode); + int ret; + unsigned int ia_valid = attr->ia_valid; + + if (!pi) + return -EACCES; + + ret = setattr_prepare(dentry, attr); + if (ret) + return ret; + + if ((ia_valid & ATTR_SIZE) && attr->ia_size != inode->i_size) { + struct eufs_inode_info *vi = EUFS_I(inode); + bool shrink; + + eufs_dbg( + "notify change (size): vi=%px inode=%px, pi=%px (%lld), %lld to %lld\n", + vi, inode, pi, eufs_iread_size(pi), inode->i_size, + attr->ia_size); + + down_write(&vi->mmap_rwsem); + shrink = attr->ia_size < inode->i_size; + + if (attr->ia_size > inode->i_size) { + unsigned long num_blocks = + DIV_ROUND_UP(attr->ia_size, PAGE_SIZE); + /* make sure the file has enough pages allocated */ + ret = eufs_extend_btree(inode, num_blocks); + if (ret < 0) { + up_write(&vi->mmap_rwsem); + return ret; + } + + /* zeroing the extended range [i_size, ia_size) */ + eufs_inode_zero_range(inode, inode->i_size, + attr->ia_size); + } + + truncate_setsize(inode, attr->ia_size); + + attr->ia_valid = ia_valid | (ATTR_CTIME | ATTR_MTIME); + + if (shrink) + eufs_shrink_btree(inode); + + /* zeroing the part beyond the new EOF [ia_size, PAGE_ALIGN(ia_size)) */ + eufs_inode_zero_range(inode, attr->ia_size, + PAGE_ALIGN(attr->ia_size)); + + up_write(&vi->mmap_rwsem); + } + eufs_dbg("notify change: inode=%px, pi=%px, imode=%x to imode=%x\n", + inode, pi, inode->i_mode, attr->ia_mode); + setattr_copy(inode, attr); + + request_persistence(inode); + + return 0; +} + +int eufs_file_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + struct eufs_inode *pi = EUFS_FRESH_PI(EUFS_PI(inode)); + unsigned int flags = eufs_get_inode_flags(inode, pi); + + flags &= FS_FL_USER_VISIBLE; + if (flags & FS_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (flags & FS_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + + stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE); + + generic_fillattr(inode, stat); + return 0; +} + +/* Transfer FS_*_FL to S_* and write to inode */ +void eufs_set_inode_flags(struct inode *inode, unsigned int flags) +{ + inode->i_flags &= + ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC); + if (flags & FS_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & FS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & FS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & FS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & FS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; + inode->i_flags |= S_DAX; +} + +/* Get S_* from inode and transfer to FS_*_FL */ +unsigned int 
eufs_get_inode_flags(struct inode *inode, struct eufs_inode *pi) +{ + unsigned int flags = inode->i_flags; + unsigned int eufs_flags = eufs_iread_flags(EUFS_FRESH_PI(pi)); + + eufs_flags &= ~(FS_SYNC_FL | FS_APPEND_FL | FS_IMMUTABLE_FL | + FS_NOATIME_FL | FS_DIRSYNC_FL); + if (flags & S_SYNC) + eufs_flags |= FS_SYNC_FL; + if (flags & S_APPEND) + eufs_flags |= FS_APPEND_FL; + if (flags & S_IMMUTABLE) + eufs_flags |= FS_IMMUTABLE_FL; + if (flags & S_NOATIME) + eufs_flags |= FS_NOATIME_FL; + if (flags & S_DIRSYNC) + eufs_flags |= FS_DIRSYNC_FL; + + return eufs_flags; +} + +static int eufs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct dax_device *dax_dev = NULL; + int ret = 0; + + /* Only for regular file */ + if (!S_ISREG(inode->i_mode)) + return -EIO; + + dax_dev = EUFS_SB(inode->i_sb)->s_dax_dev; + ret = dax_writeback_mapping_range(mapping, dax_dev, wbc); + + return ret; +} + +const struct address_space_operations eufs_aops = { + .writepages = eufs_writepages, +}; + +struct inode *pre_inodes_get(struct dentry *dentry, struct inode *dir, + umode_t mode, bool special, dev_t rdev) +{ + struct inode *inode = NULL; + struct eufs_inode __pmem *pi; + struct super_block *sb = dir->i_sb; + struct eufs_sb_info *sbi = EUFS_SB(sb); + struct eufs_inode __pmem *dir_pi = EUFS_FRESH_PI(EUFS_PI(dir)); + struct eufs_inode_info *vi; + void *pre_page = NULL; + int err; + u64 blocks; + + NV_ASSERT(dir_pi); + + inode = new_inode(sb); + if (IS_ERR(inode)) + return inode; + + vi = EUFS_I(inode); + vi->i_volatile_dict = NULL; + + BUG_ON(inode->i_nlink != 1); + inode->i_size = 0; + vi->i_ext = 0; + vi->i_dotdot = 0; + vi->i_version = 1; + + pi = eufs_malloc_pinode(sb); + if (!pi) + goto no_space_err; + + pi->i_fresh = 0; + EUFS_TWIN_PI(pi)->i_fresh = 0; + + blocks = 0; + if (S_ISREG(mode)) { + pre_page = eufs_malloc_file_data(sb); + if (!pre_page) + goto no_space_err; + blocks = 1; + } else if (S_ISLNK(mode)) { + pre_page = eufs_zalloc_symlink(sb); + if (!pre_page) + goto no_space_err; + blocks = 1; + } else if (S_ISDIR(mode)) { + pre_page = eufs_zalloc_htable(sb); + if (!pre_page) + goto no_space_err; + blocks = 1; + } + inode->i_blocks = blocks << (inode->i_blkbits - 9); + + eufs_dbg("bind inode(%px) ->pi(%px)->i_ino=0x%lx, vi->trans=%d\n", + inode, pi, eufs_pi2ino(sb, pi), vi->i_lock_transferred); + inode->i_ino = eufs_pi2ino(sb, pi); + + inode_init_owner(inode, dir, mode); + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_generation = atomic_add_return(1, &sbi->next_generation); + + if (special) + init_special_inode(inode, mode, rdev); + + eufs_iwrite_root(pi, EUFS_POISON_VALUE); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + eufs_iwrite_rdev(pi, inode->i_rdev); + } else { + vi->i_volatile_height = 0; + if (S_ISREG(inode->i_mode)) { + vi->i_volatile_tree_blocks = 1; + eufs_iwrite_tree_blocks(pi, 0); + vi->i_volatile_root = pre_page; + /* 0th block is treated as a hole until allocated. 
*/ + vi->hole_at_sta = true; + eufs_iwrite_root( + pi, encode_root(p2o(sb, vi->i_volatile_root), + vi->i_volatile_height)); + eufs_alloc_batch_init(&vi->page_batch, 2); + eufs_alloc_batch_add(sb, &vi->page_batch, + vi->i_volatile_root); + + } else if (S_ISDIR(inode->i_mode)) { + vi->i_volatile_dict = NULL; + eufs_iwrite_dict(pi, p2o(sb, pre_page)); + /* allocation persisted in do_dep_diradd */ + } else if (S_ISLNK(inode->i_mode)) { + eufs_iwrite_root(pi, p2o(sb, pre_page)); + /* allocation persisted in do_dep_diradd */ + } + } + + eufs_iwrite_mode(pi, inode->i_mode); + eufs_iwrite_size(pi, 0); + + eufs_dbg( + "alloc inode=%px pi=%px pi->root=0x%llx pi->i_mode=0%o on cpu %d\n", + inode, pi, eufs_iread_root(pi), eufs_iread_mode(pi), + smp_processor_id()); + + eufs_iwrite_flags(pi, dir_pi->i_flags); + eufs_set_inode_flags(inode, eufs_iread_flags(pi)); + + err = insert_inode_locked(inode); + if (err) { + eufs_err(sb, "eufs_new_inode failed ino 0x%lx err %d\n", + inode->i_ino, err); + goto out; + } + + return inode; + +no_space_err: + err = -ENOSPC; +out: + if (pre_page) + nv_free(sb, pre_page); + if (pi) + nv_free(sb, pi); + if (inode) { + make_bad_inode(inode); + inode->i_ino = 0; + iput(inode); + } + return ERR_PTR(err); +} + +void eufs_inode_size_write(struct inode *inode, loff_t new_size) +{ + i_size_write(inode, new_size); + request_persistence(inode); +} diff --git a/fs/eulerfs/inode.h b/fs/eulerfs/inode.h new file mode 100644 index 000000000000..ef6f6e39a340 --- /dev/null +++ b/fs/eulerfs/inode.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_INODE_H +#define EUFS_INODE_H + +extern struct inode *eufs_iget(struct super_block *sb, struct eufs_inode *pi); + +extern void eufs_put_inode(struct inode *inode); + +extern void eufs_evict_inode(struct inode *inode); + +extern int eufs_write_inode(struct inode *inode, + struct writeback_control *wbc); + +extern int eufs_notify_change(struct dentry *dentry, struct iattr *attr); + +extern int eufs_file_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags); + +extern void eufs_set_inode_flags(struct inode *inode, unsigned int flags); + +extern unsigned int eufs_get_inode_flags(struct inode *inode, + struct eufs_inode *pi); + +extern void eufs_sync_pinode(struct inode *inode, struct eufs_inode *pi, + bool evict); + +extern struct inode *pre_inodes_get(struct dentry *dentry, struct inode *dir, + umode_t mode, bool special, dev_t rdev); + +extern void eufs_inode_size_write(struct inode *inode, loff_t new_size); + +#endif /* EUFS_INODE_H */
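The trickiest piece of this patch is the twin persistent inode handling in eufs_sync_pinode(): updates always go to the stale twin slot, whose freshness counter is then bumped above the other's, so the latest consistent copy can be picked after a crash. The userspace sketch below models only that core idea; the new-inode and counter-overflow corner cases handled by the patch are deliberately simplified away, and the structures are stand-ins rather than the on-media layout.

#include <stdint.h>
#include <stdio.h>

struct pinode_slot {
        uint16_t fresh;         /* 0 means never written */
        uint64_t size;          /* one field standing in for the inode payload */
};

struct twin_pinode {
        struct pinode_slot slot[2];
};

/* Return the slot holding the latest persisted copy. */
static struct pinode_slot *fresh_slot(struct twin_pinode *t)
{
        return t->slot[0].fresh >= t->slot[1].fresh ? &t->slot[0] : &t->slot[1];
}

/* Write the new state into the stale slot, then publish it as the freshest. */
static void sync_pinode(struct twin_pinode *t, uint64_t new_size)
{
        struct pinode_slot *fresh = fresh_slot(t);
        struct pinode_slot *stale =
                (fresh == &t->slot[0]) ? &t->slot[1] : &t->slot[0];

        stale->size = new_size;
        /* On real NVM, a cacheline flush and ordering barrier sit here. */
        stale->fresh = fresh->fresh + 1;
}

int main(void)
{
        struct twin_pinode t = { .slot = { { 0, 0 }, { 0, 0 } } };

        sync_pinode(&t, 100);
        sync_pinode(&t, 200);
        printf("latest size: %llu\n",
               (unsigned long long)fresh_slot(&t)->size);      /* prints 200 */
        return 0;
}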
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR CVE: NA
--------------------------------------
Implement DAX operations, such as read and write.
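The read and write paths map file offsets through a radix-style block tree; the standalone sketch below illustrates only the per-level index arithmetic used when walking such a tree. The degree of 512 pointers per 4 KiB index page (DEGREE_SHIFT = 9) is an assumption for illustration, as are the sample block numbers in main().

#include <stdio.h>

#define DEGREE_SHIFT 9          /* assumed: 512 pointers per 4 KiB index page */

static void print_lookup_path(unsigned long blocknr, unsigned int height)
{
        printf("block %lu:", blocknr);
        while (height > 0) {
                unsigned int shift = (height - 1) * DEGREE_SHIFT;
                unsigned long idx = blocknr >> shift;

                printf(" [%lu]", idx);          /* child index at this level */
                blocknr &= (1UL << shift) - 1;  /* remainder for lower levels */
                height--;
        }
        printf("\n");
}

int main(void)
{
        /* A height-2 tree addresses up to 512 * 512 data blocks (~1 GiB). */
        print_lookup_path(0, 2);        /* -> [0][0] */
        print_lookup_path(513, 2);      /* -> [1][1] */
        print_lookup_path(262143, 2);   /* -> [511][511] */
        return 0;
}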
Signed-off-by: Mingkai Dong dongmingkai1@huawei.com Signed-off-by: Hou Tao houtao1@huawei.com Signed-off-by: Zhikang Zhang zhangzhikang1@huawei.com Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/eulerfs/dax.c | 1696 ++++++++++++++++++++++++++++++++++++++++++++++ fs/eulerfs/dax.h | 101 +++ 2 files changed, 1797 insertions(+) create mode 100644 fs/eulerfs/dax.c create mode 100644 fs/eulerfs/dax.h
diff --git a/fs/eulerfs/dax.c b/fs/eulerfs/dax.c new file mode 100644 index 000000000000..9ec8ad713fd9 --- /dev/null +++ b/fs/eulerfs/dax.c @@ -0,0 +1,1696 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/pfn_t.h> +#include <linux/buffer_head.h> +#include <linux/iomap.h> +#include <linux/dax.h> +#include <linux/cpufeature.h> +#include <linux/pgtable.h> +#include "euler.h" +#include "dax.h" +#include "dep.h" +#include "wear.h" +#include "alloc_interface.h" + +int eufs_persist_btree_node(void *root, int sta, int len); + +static __always_inline void eufs_clear_pmem(void *addr, size_t size) +{ + memset(addr, 0, size); + eufs_flush_range(addr, size); +} + +static __always_inline void *eufs_find_data_block_btree(struct inode *inode, + unsigned long blocknr, + __le64 **parent) +{ + __le64 *bp; + u32 height, bit_shift; + unsigned int idx; + struct eufs_inode_info *vi = EUFS_I(inode); + + /* inode must be a regular file */ + height = vi->i_volatile_height; + bp = vi->i_volatile_root; + + NV_ASSERT(blocknr < (1UL << (height * EUFS_FILE_TREE_DEGREE_SHIFT))); + + if (height == 0) { + BUG_ON(blocknr != 0); + if (parent) + *parent = NULL; + return (void *)bp; + } + + if (height == 1) { + if (bp[blocknr] == NULL_VAL) + BUG(); + + if (parent) + *parent = bp; + return s2p(inode->i_sb, bp[blocknr]); + } + while (height > 0) { + bit_shift = (height - 1) * EUFS_FILE_TREE_DEGREE_SHIFT; + idx = blocknr >> bit_shift; + if (parent) + *parent = bp; + bp = s2p(inode->i_sb, bp[idx]); + if (bp == 0) + return 0; + blocknr = blocknr & ((1 << bit_shift) - 1); + height--; + } + return bp; +} + +static int eufs_extend_btree_recursive_blind(struct inode *inode, + int level_left, __le64 *parent, + int sta_index, + int end_index, /* inclusive */ + struct alloc_batch *ab) +{ + struct super_block *sb = inode->i_sb; + void *p; + long r; + int i; + + for (i = sta_index; i <= end_index; ++i) { + if (!level_left) { + parent[i] = NULL_ADDR; + continue; + } + /* level_left */ + p = eufs_alloc_batch_allocate_file_index(inode->i_sb, ab); + if (!p) + return -ENOSPC; + parent[i] = p2s(sb, p); + /* recur */ + r = eufs_extend_btree_recursive_blind(inode, level_left - 1, p, + 0, + EUFS_FILE_TREE_DEGREE - 1, + ab); + if (IS_ERR_VALUE(r)) + return r; + } + return 0; +} + +/* + * Allocate blocks from top to bottom. + * + * Only allocate the interior blocks which will have a leaf child block or + * an interior child block. And the unused pointers for children will NOT + * be zeroed. + * + * New leaf blocks are not allocated, and their values are set to NULL_ADDR. 
+ */ +static int eufs_extend_btree_recursive(struct inode *inode, int level_left, + __le64 *parent, unsigned long origin, + unsigned long num_blocks, + struct alloc_batch *ab, bool blind) +{ + struct super_block *sb = inode->i_sb; + const unsigned long nblocks_per_slot = + 1 << (level_left * EUFS_FILE_TREE_DEGREE_SHIFT); + unsigned long off; + int sta_index, end_index; + int i; + long r; + void *p; + + if (blind) { + return eufs_extend_btree_recursive_blind( + inode, level_left, parent, 0, EUFS_FILE_TREE_DEGREE - 1, + ab); + } + + if (origin == 0) { + /* end_index could be zero */ + end_index = (num_blocks - 1) / nblocks_per_slot; + r = eufs_extend_btree_recursive_blind(inode, level_left, parent, + 0, end_index - 1, ab); + if (IS_ERR_VALUE(r)) + return r; + if (!level_left) { + parent[end_index] = NULL_ADDR; + } else { + p = eufs_alloc_batch_allocate_file_index(inode->i_sb, + ab); + if (!p) + return -ENOSPC; + parent[end_index] = p2s(sb, p); + off = nblocks_per_slot * end_index; + r = eufs_extend_btree_recursive(inode, level_left - 1, + p, 0, num_blocks - off, + ab, false); + if (IS_ERR_VALUE(r)) + return r; + } + return 0; + } + + sta_index = (origin - 1) / nblocks_per_slot; + end_index = (num_blocks - 1) / nblocks_per_slot; + + /* + * No need to create a new sub-tree, so descend to the sub-tree + * rooted in parent[sta_index] + */ + if (sta_index == end_index) { + if (!level_left) + return 0; + + /* calculate the needed block count in the sub-tree */ + off = sta_index * nblocks_per_slot; + r = eufs_extend_btree_recursive(inode, level_left - 1, + s2p(sb, parent[sta_index]), + origin - off, num_blocks - off, + ab, false); + if (IS_ERR_VALUE(r)) + return r; + return 0; + } + + if (!level_left) { + for (i = sta_index + 1; i <= end_index; ++i) + parent[i] = NULL_ADDR; + + return 0; + } + + /* extend sub-tree shared with existed blocks to its maximum size */ + off = sta_index * nblocks_per_slot; + r = eufs_extend_btree_recursive(inode, level_left - 1, + s2p(sb, parent[sta_index]), + origin - off, nblocks_per_slot, ab, + false); + if (IS_ERR_VALUE(r)) + return r; + + /* new sub-trees which will be fully initialized */ + r = eufs_extend_btree_recursive_blind(inode, level_left, parent, + sta_index + 1, end_index - 1, ab); + if (IS_ERR_VALUE(r)) + return r; + + /* the last new sub-tree which may only needs partial initialization */ + p = eufs_alloc_batch_allocate_file_index(inode->i_sb, ab); + if (!p) + return -ENOSPC; + parent[end_index] = p2s(sb, p); + + off = end_index * nblocks_per_slot; + r = eufs_extend_btree_recursive(inode, level_left - 1, p, 0, + num_blocks - off, ab, false); + if (IS_ERR_VALUE(r)) + return r; + return 0; +} + +static unsigned long eufs_count_pages(unsigned long leaf_blocks) +{ + unsigned long tot = leaf_blocks; + + while (leaf_blocks > 1) { + leaf_blocks = DIV_ROUND_UP(leaf_blocks, EUFS_FILE_TREE_DEGREE); + tot += leaf_blocks; + } + return tot; +} + +/* So that we have page[0..num_blocks-1] */ +int eufs_extend_btree(struct inode *inode, unsigned long num_blocks) +{ + struct eufs_inode_info *vi = EUFS_I(inode); + unsigned long full_size; + unsigned long need_blocks; + __le64 *new_root; + long r = 0; + struct alloc_batch *ab = &vi->page_batch; + + if (!num_blocks) + return 0; + if (vi->i_volatile_tree_blocks >= num_blocks) + /* already allocated */ + return 0; + if (num_blocks > inode->i_sb->s_maxbytes >> EUFS_BLOCK_SIZE_BITS) + return -EFBIG; + + /* Grow from vi->i_volatile_tree_blocks to num_blocks */ + need_blocks = eufs_count_pages(num_blocks) - + 
eufs_count_pages(vi->i_volatile_tree_blocks); + + /* Set NULL_ADDR for extended data blocks */ + need_blocks -= (num_blocks - vi->i_volatile_tree_blocks); + + r = eufs_alloc_batch_pre_allocate_begin(inode->i_sb, ab, need_blocks); + if (IS_ERR_VALUE(r)) + return r; + + BUG_ON(!vi->i_volatile_root); + if (!vi->i_volatile_root) { + vi->i_volatile_root = + eufs_alloc_batch_allocate_file_data(inode->i_sb, ab); + BUG_ON(!vi->i_volatile_root); + vi->i_volatile_height = 0; + } + if (num_blocks == 1) { + /* Already allocated */ + goto out; + } + full_size = 1UL + << (vi->i_volatile_height * EUFS_FILE_TREE_DEGREE_SHIFT); + while (full_size < num_blocks) { + new_root = + eufs_alloc_batch_allocate_file_index(inode->i_sb, ab); + new_root[0] = p2s(inode->i_sb, vi->i_volatile_root); + vi->i_volatile_root = new_root; + vi->i_volatile_height++; + full_size <<= EUFS_FILE_TREE_DEGREE_SHIFT; + } + BUG_ON(vi->i_volatile_height < 1); + r = eufs_extend_btree_recursive(inode, vi->i_volatile_height - 1, + vi->i_volatile_root, + vi->i_volatile_tree_blocks, num_blocks, + ab, false); +out: + eufs_alloc_batch_pre_allocate_end(inode->i_sb, ab); + vi->i_volatile_tree_blocks = num_blocks; + num_blocks <<= (inode->i_blkbits - 9); + if (num_blocks > inode->i_blocks) + inode->i_blocks = num_blocks; + return r; +} + +int eufs_alloc_blocks_btree(struct inode *inode, unsigned long start_block, + unsigned long num_blocks, int zero) +{ + long r; + unsigned long blocknr, need_blocks = 0, + end_block = start_block + num_blocks; + long pi_tree_blocks = + eufs_iread_tree_blocks(EUFS_FRESH_PI(EUFS_PI(inode))); + struct eufs_inode_info *vi = EUFS_I(inode); + struct alloc_batch *ab = &vi->page_batch; + __le64 *parent; + unsigned int ofs; + void *xmem; + int last_ofs_line = -1; + + r = eufs_extend_btree(inode, start_block + num_blocks); + if (r) + return r; + + if (start_block == 0) + vi->hole_at_sta = false; + + /* The 0th data block is always allocated. */ + blocknr = start_block ? start_block : 1; + + /* TODO: need optimization. */ + while (blocknr < end_block) { + eufs_find_data_block_btree(inode, blocknr, &parent); + BUG_ON(!parent); + ofs = blocknr & (EUFS_FILE_TREE_DEGREE - 1); + while (ofs < EUFS_FILE_TREE_DEGREE && blocknr < end_block) { + if (parent[ofs] == NULL_ADDR) { + /* + * The leaf blocks are not allocated before persist, + * e.g. through truncate() + fsync() + */ + if (blocknr < pi_tree_blocks) { + xmem = eufs_zalloc_file_data( + inode->i_sb); + if (!xmem) + return -ENOSPC; + eufs_alloc_persist(inode->i_sb, xmem, + false); + eufs_flush_page(xmem); + parent[ofs] = p2s(inode->i_sb, xmem); + eufs_flush_cacheline(&parent[ofs]); + + invalidate_inode_pages2_range( + inode->i_mapping, blocknr, + blocknr); + + } else + need_blocks++; + } + ofs++; + blocknr++; + } + } + + if (!need_blocks) + return 0; + + /* TODO: need optimization. */ + r = eufs_alloc_batch_pre_allocate_begin(inode->i_sb, ab, need_blocks); + if (IS_ERR_VALUE(r)) + return r; + + blocknr = start_block ? 
start_block : 1; + while (blocknr < end_block) { + eufs_find_data_block_btree(inode, blocknr, &parent); + BUG_ON(!parent); + last_ofs_line = -1; + ofs = blocknr & (EUFS_FILE_TREE_DEGREE - 1); + while (ofs < EUFS_FILE_TREE_DEGREE && blocknr < end_block) { + if (parent[ofs] == NULL_ADDR) { + xmem = eufs_alloc_batch_allocate_file_data( + inode->i_sb, ab); + if (zero == EUFS_ALLOC_BLOCKS_ZERO_ALL || + ((zero == EUFS_ALLOC_BLOCKS_ZERO_EDGE) && + (blocknr == start_block || + blocknr == end_block - 1))) + eufs_clear_pmem(xmem, PAGE_SIZE); + + parent[ofs] = p2s(inode->i_sb, xmem); + + invalidate_inode_pages2_range(inode->i_mapping, + blocknr, blocknr); + + if (last_ofs_line == -1) + last_ofs_line = + (ofs >> + EUFS_PTR_CNT_SHIFT_PER_CACHELINE); + } + ofs++; + if (last_ofs_line != -1 && + (ofs >> EUFS_PTR_CNT_SHIFT_PER_CACHELINE) != + last_ofs_line) { + eufs_flush_cacheline(&parent[ofs - 1]); + last_ofs_line = -1; + } + blocknr++; + } + if (last_ofs_line != -1) + eufs_flush_cacheline(&parent[ofs - 1]); + } + + eufs_alloc_batch_pre_allocate_end(inode->i_sb, ab); + return r; +} + +static int eufs_alloc_blocks_btree_for_write(struct inode *inode, loff_t pos, + int len) +{ + long r; + unsigned long blocknr, need_blocks = 0; + long pi_tree_blocks = + eufs_iread_tree_blocks(EUFS_FRESH_PI(EUFS_PI(inode))); + size_t file_size_block = PAGE_DIV_ROUND_UP(inode->i_size); + struct eufs_inode_info *vi = EUFS_I(inode); + struct alloc_batch *ab = &vi->page_batch; + __le64 *parent; + void *xmem; + /* The page first byte resides in */ + unsigned long first_page = PAGE_DIV_ROUND_DOWN(pos); + /* The page last byte resides in */ + unsigned long last_page = PAGE_DIV_ROUND_DOWN(pos + len - 1); + unsigned long pending_flush_bits; + int start_offset; + int end_offset; + int ofs; + + r = eufs_extend_btree(inode, last_page + 1); + if (r) + return r; + + /* hole_at_sta is used by SEEK_HOLE. */ + /* FIXME: We need a durable way to present hole_at_sta. */ + if (first_page == 0) + vi->hole_at_sta = false; + + /* The 0th data block is always allocated. */ + blocknr = first_page ? first_page : 1; + + /* + * Can be optimized by saving the top-down parent pointers + * in a cursor and advancing by moving the cursor + */ + while (blocknr <= last_page) { + eufs_find_data_block_btree(inode, blocknr, &parent); + BUG_ON(!parent); + + /* One ofs, one block */ + for (ofs = blocknr & (EUFS_FILE_TREE_DEGREE - 1); + ofs < EUFS_FILE_TREE_DEGREE && blocknr <= last_page; + ++ofs, ++blocknr) { + /* Not a hole */ + if (parent[ofs] != NULL_ADDR) + continue; + + /* Hole */ + if (blocknr < pi_tree_blocks) { + /* + * TODO: optimize option, instead of wrting + * zeros here, we can write the actual data + * instead. + */ + xmem = eufs_zalloc_file_data(inode->i_sb); + if (!xmem) + return -ENOSPC; + eufs_alloc_persist(inode->i_sb, xmem, false); + eufs_flush_page(xmem); + parent[ofs] = p2s(inode->i_sb, xmem); + eufs_flush_cacheline(&parent[ofs]); + + invalidate_inode_pages2_range(inode->i_mapping, + blocknr, blocknr); + + } else + need_blocks++; + } + } + + if (!need_blocks) + return 0; + + /* FIXME: This requries re-write */ + r = eufs_alloc_batch_pre_allocate_begin(inode->i_sb, ab, need_blocks); + if (IS_ERR_VALUE(r)) + return r; + + start_offset = pos & (PAGE_SIZE - 1); + end_offset = (pos + len) & (PAGE_SIZE - 1); + blocknr = first_page ? 
first_page : 1; + while (blocknr <= last_page) { + unsigned long bit; + + eufs_find_data_block_btree(inode, blocknr, &parent); + + BUG_ON(!parent); + + /* No cacheline is pending to be flushed for this index block */ + pending_flush_bits = 0; + + for (ofs = blocknr & (EUFS_FILE_TREE_DEGREE - 1); + ofs < EUFS_FILE_TREE_DEGREE && blocknr <= last_page; + ++ofs, ++blocknr) { + /* Not a hole */ + if (parent[ofs] != NULL_ADDR) + continue; + + xmem = eufs_alloc_batch_allocate_file_data(inode->i_sb, + ab); + if (unlikely(blocknr == first_page && + (start_offset != 0))) + eufs_clear_pmem(xmem, start_offset); + + /* Do not clear the last block which is after the EOF-block */ + if (unlikely(blocknr == last_page && + (end_offset != 0) && + blocknr < file_size_block)) + eufs_clear_pmem((char *)xmem + end_offset, + PAGE_SIZE - end_offset); + + parent[ofs] = p2s(inode->i_sb, xmem); + + invalidate_inode_pages2_range(inode->i_mapping, blocknr, + blocknr); + set_bit(ofs >> EUFS_PTR_CNT_SHIFT_PER_CACHELINE, + &pending_flush_bits); + } + + for (bit = find_first_bit(&pending_flush_bits, 64); bit < 64; + bit = find_next_bit(&pending_flush_bits, 64, bit + 1)) { + ofs = bit << EUFS_PTR_CNT_SHIFT_PER_CACHELINE; + eufs_flush_cacheline(&parent[ofs]); + } + } + + eufs_alloc_batch_pre_allocate_end(inode->i_sb, ab); + return r; +} + +static void eufs_free_recursive_btree_blind(struct super_block *sb, + __le64 *root, int level_left) +{ + int i; + + BUG_ON(!root); + if (level_left == -1) { + if (root != NULL_ADDR_PTR) + nv_zfree(sb, root); + return; + } + /* level_left */ + BUG_ON(root == NULL_ADDR_PTR); + for (i = 0; i < EUFS_FILE_TREE_DEGREE; ++i) { + eufs_free_recursive_btree_blind(sb, s2p(sb, root[i]), + level_left - 1); + } + nv_zfree(sb, root); +} + +static void eufs_free_recursive_btree(struct super_block *sb, __le64 *root, + int level_left, u64 blocks_left) +{ + u64 nblocks_per_slot; + int i; + + BUG_ON(!root); + BUG_ON(!blocks_left); + if (level_left == -1) { + if (root != NULL_ADDR_PTR) + nv_zfree(sb, root); + return; + } + /* level_left */ + BUG_ON(root == NULL_ADDR_PTR); + nblocks_per_slot = 1 << (level_left * EUFS_FILE_TREE_DEGREE_SHIFT); + for (i = 0; i < EUFS_FILE_TREE_DEGREE; ++i) { + if (blocks_left >= nblocks_per_slot) { + /* the whole sub-tree needs to be freed */ + eufs_free_recursive_btree_blind(sb, s2p(sb, root[i]), + level_left - 1); + blocks_left -= nblocks_per_slot; + if (blocks_left == 0) + break; + } else { + eufs_free_recursive_btree(sb, s2p(sb, root[i]), + level_left - 1, blocks_left); + break; + } + } + nv_zfree(sb, root); +} + +int eufs_shrink_btree(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct eufs_inode_info *vi = EUFS_I(inode); + struct eufs_inode *pi = EUFS_PI(inode); + + void *root; + + u64 capacity; + __le64 *new_root; + __le64 *old_root; + int new_height; + u64 size; + u64 blocks; + u64 count; + u64 blocks_left; + u64 __maybe_unused pi_root_o; + + BUG_ON(!inode_is_locked(inode)); + BUG_ON(vi->i_volatile_height > EUFS_MAX_FILE_TREE_HEIGHT); + + pi_root_o = eufs_iread_root(pi); + eufs_dbg("shrink btree stage 1: pi=%px vi->{s=%lld b=%lld h=%d r=%px} pi->{s=%lld b=%lld h=%d r=0x%llx}\n", + pi, inode->i_size, vi->i_volatile_tree_blocks, + vi->i_volatile_height, vi->i_volatile_root, + eufs_iread_size(pi), eufs_iread_tree_blocks(pi), + root_height(pi_root_o), root_ptr(pi_root_o)); + eufs_sync_pinode(inode, pi, false); + + capacity = PAGE_SIZE + << (EUFS_FILE_TREE_DEGREE_SHIFT * vi->i_volatile_height); + new_root = vi->i_volatile_root; + old_root = 
vi->i_volatile_root; + new_height = vi->i_volatile_height; + size = inode->i_size == 0 ? 1 : inode->i_size; + + /* old block count */ + blocks = vi->i_volatile_tree_blocks; + + /* Check whether the height should be reduced */ + for (;;) { + capacity >>= EUFS_FILE_TREE_DEGREE_SHIFT; + if (capacity < size || capacity < PAGE_SIZE) + break; + new_root = s2p(sb, new_root[0]); + new_height--; + } + vi->i_volatile_root = new_root; + vi->i_volatile_height = new_height; + vi->i_volatile_tree_blocks = DIV_ROUND_UP(size, PAGE_SIZE); + + eufs_sync_pinode(inode, pi, false); + + eufs_alloc_batch_persist_reset(sb, &vi->page_batch); + + /* new block count and it's greater than 0 */ + count = blocks_left = vi->i_volatile_tree_blocks; + + /* shrink from old_root/_height to new_root/_height */ + root = old_root; + + if (blocks_left == blocks) + goto out; + + if (blocks == 1) + goto out; + + if (blocks <= EUFS_FILE_BCNT_WITH_HEIGHT(1)) { + int i; + + __le64 *proot = root; + + for (i = count; i < blocks; ++i) { + nv_free(sb, s2p(sb, proot[i])); + proot[i] = NULL_VAL; + } + goto out; + } + + if (blocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) { + int i; + __le64 *proot = root; + + for (i = EUFS_H2_INDEX_IN_L0(count); i < EUFS_FILE_TREE_DEGREE; + ++i) { + __le64 *pages = s2p(sb, proot[i]); + int j = EUFS_H2_INDEX_IN_L1(count); + + for (; j < EUFS_FILE_TREE_DEGREE; ++j) { + nv_free(sb, s2p(sb, pages[j])); + pages[j] = NULL_VAL; + count++; + if (count >= blocks) + break; + } + if (EUFS_H2_IS_FREE_L1_SUBTREE(i, blocks_left)) { + nv_free(sb, pages); + proot[i] = NULL_VAL; + } + if (count >= blocks) + break; + } + goto out; + } + + if (blocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) { + int i, j, k; + __le64 *pproot = root; + + for (i = EUFS_H3_INDEX_IN_L0(count); i < EUFS_FILE_TREE_DEGREE; + ++i) { + __le64 *proot = s2p(sb, pproot[i]); + + j = EUFS_H3_INDEX_IN_L1(i, count); + for (; j < EUFS_FILE_TREE_DEGREE; ++j) { + __le64 *pages = s2p(sb, proot[j]); + + k = EUFS_H3_INDEX_IN_L2(count); + for (; k < EUFS_FILE_TREE_DEGREE; ++k) { + nv_free(sb, s2p(sb, pages[k])); + pages[k] = NULL_VAL; + count++; + if (count >= blocks) + break; + } + if (EUFS_H3_IS_FREE_L2_SUBTREE(i, j, + blocks_left)) { + nv_free(sb, pages); + proot[j] = NULL_VAL; + } + if (count >= blocks) + break; + } + if (EUFS_H3_IS_FREE_L1_SUBTREE(i, blocks_left)) { + nv_free(sb, proot); + pproot[i] = NULL_VAL; + } + if (count >= blocks) + break; + } + } + +out: + while (old_root != new_root) { + __le64 *r = old_root; + + BUG_ON(!r); + old_root = s2p(sb, r[0]); + nv_free(sb, r); + } + + return 0; +} + +int eufs_free_btree(struct super_block *sb, void *root, int height, u64 blocks) +{ + NV_ASSERT(!(height < 0 || height > EUFS_MAX_FILE_TREE_HEIGHT)); + eufs_dbg("nvfree tree root: %px\n", root); + if (blocks == 0) + return 0; + if (blocks == 1) { + /* height == 0 */ + nv_free(sb, root); + return 0; + } + eufs_free_recursive_btree(sb, (__le64 *)root, height - 1, blocks); + return 0; +} + +int eufs_persist_btree_node(void *root, int sta, int len) +{ + BUG_ON(len > EUFS_FILE_TREE_DEGREE); + BUG_ON(len < 0); + BUG_ON(sta + len > EUFS_FILE_TREE_DEGREE); + BUG_ON(sta + len < 0); + if (len == 0) + return 0; + eufs_ptr_fast_check(root); + eufs_flush_range(((void **)root) + sta, len * sizeof(void *)); + return 0; +} + +static void eufs_persist_btree_h2_subtree(struct super_block *sb, void *root, + int start0, int idx0, int idx1) +{ + __le64 *proot = root; + int i; + void *p; + + for (i = start0; i < idx0; ++i) { + BUG_ON(proot[i] == 0); + + p = s2p(sb, proot[i]); + 
eufs_ptr_fast_check(p); + eufs_persist_btree_node(p, 0, EUFS_FILE_TREE_DEGREE); + } + + /* + * According to the WARN_ON in eufs_persist_new_btree_h2, + * idx0 < EUFS_FILE_TREE_DEGREE if idx1 != 0. So the following code + * is safe. + */ + if (idx1 != 0) { + p = s2p(sb, proot[idx0]); + eufs_persist_btree_node(p, 0, idx1); + } +} + +static void eufs_persist_btree_h2_root(void *root, int start0, int idx0, + int idx1) +{ + int cnt = idx0 - start0; + + /* + * It's the L1 index of the next block, so when it's not equals with 0, + * the node[idx0] also needs persistence. + */ + if (idx1 != 0) + cnt++; + + eufs_persist_btree_node(root, start0, cnt); +} + +static void eufs_persist_new_btree_h2_by_idx(struct super_block *sb, void *root, + int start0, int idx0, int idx1) +{ + eufs_persist_btree_h2_subtree(sb, root, start0, idx0, idx1); + /* It's a new btree, so persist the whole root node */ + eufs_persist_btree_h2_root(root, 0, idx0, idx1); +} + +static void eufs_persist_new_btree_h2(struct super_block *sb, void *root, + int start0, unsigned long bcnt) +{ + /* the L0/L1 index of the next block in new tree with height 2 */ + int idx0 = EUFS_H2_INDEX_IN_L0(bcnt); + int idx1 = EUFS_H2_INDEX_IN_L1(bcnt); + + /* + * Notice a corner case: bcnt == EUFS_FILE_BCNT_WITH_HEIGHT(2), in + * which (idx0 == EUFS_FILE_TREE_DEGREE && idx1 == 0) + */ + WARN_ON(idx0 == EUFS_FILE_TREE_DEGREE && idx1); + + eufs_persist_new_btree_h2_by_idx(sb, root, start0, idx0, idx1); +} + +static void eufs_persist_inc_btree_h2_by_idx(struct super_block *sb, void *root, + int old_idx0, int old_idx1, + int new_idx0, int new_idx1) +{ + __le64 *proot = root; + void *p; + int start; + + p = s2p(sb, proot[old_idx0]); + if (old_idx0 == new_idx0) { + if (old_idx0 == EUFS_FILE_TREE_DEGREE) + return; + + eufs_persist_btree_node(p, old_idx1, new_idx1 - old_idx1); + + /* node[old_idx0] needs persistence */ + if (!old_idx1) + eufs_persist_btree_node(root, old_idx0, 1); + + return; + } + + eufs_persist_btree_node(p, old_idx1, EUFS_FILE_TREE_DEGREE - old_idx1); + + eufs_persist_btree_h2_subtree(sb, root, old_idx0 + 1, new_idx0, + new_idx1); + + start = old_idx0; + /* if old_idx0 is not 0, root[start] must have already been persisted */ + if (old_idx1) + start++; + eufs_persist_btree_h2_root(root, start, new_idx0, new_idx1); +} + +static void eufs_persist_inc_btree_h2(struct super_block *sb, void *root, + unsigned long old_bcnt, + unsigned long new_bcnt) +{ + /* the L0/L1 index of the next block in tree */ + int old_idx0 = EUFS_H2_INDEX_IN_L0(old_bcnt); + int old_idx1 = EUFS_H2_INDEX_IN_L1(old_bcnt); + int new_idx0 = EUFS_H2_INDEX_IN_L0(new_bcnt); + int new_idx1 = EUFS_H2_INDEX_IN_L1(new_bcnt); + + /* + * Notice a corner case: bcnt == EUFS_FILE_BCNT_WITH_HEIGHT(2), in + * which (idx0 == EUFS_FILE_TREE_DEGREE && idx1 == 0) + */ + WARN_ON(old_idx0 == EUFS_FILE_TREE_DEGREE && old_idx1); + WARN_ON(new_idx0 == EUFS_FILE_TREE_DEGREE && new_idx1); + + eufs_persist_inc_btree_h2_by_idx(sb, root, old_idx0, old_idx1, new_idx0, + new_idx1); +} + +static void eufs_persist_new_btree_h3(struct super_block *sb, void *root, + int start0, unsigned long bcnt_left) +{ + int i; + unsigned long left = bcnt_left; + __le64 *pproot = root; + + for (i = start0; i < EUFS_FILE_TREE_DEGREE; ++i) { + __le64 *proot = s2p(sb, pproot[i]); + int j; + + for (j = 0; j < EUFS_FILE_TREE_DEGREE; ++j) { + void *p = s2p(sb, proot[j]); + + if (left >= EUFS_FILE_TREE_DEGREE) { + eufs_persist_btree_node(p, 0, + EUFS_FILE_TREE_DEGREE); + left -= EUFS_FILE_TREE_DEGREE; + } else { + 
eufs_persist_btree_node(p, 0, left); + left = 0; + j++; + break; + } + } + + eufs_persist_btree_node(proot, 0, j); + if (!left) { + i++; + break; + } + } + + eufs_persist_btree_node(root, 0, i); +} + +static void eufs_persist_inc_btree_h3(struct super_block *sb, void *root, + unsigned long old_bcnt, + unsigned long new_bcnt) +{ + /* The L0/L1/L2 position of the next block in tree */ + int o_idx0 = EUFS_H3_INDEX_IN_L0(old_bcnt); + int o_idx1 = EUFS_H3_INDEX_IN_L1(o_idx0, old_bcnt); + int o_idx2 = EUFS_H3_INDEX_IN_L2(old_bcnt); + int n_idx0 = EUFS_H3_INDEX_IN_L0(new_bcnt); + int n_idx1 = EUFS_H3_INDEX_IN_L1(n_idx0, new_bcnt); + int n_idx2 = EUFS_H3_INDEX_IN_L2(new_bcnt); + __le64 *pproot = root; + __le64 *proot; + void *p; + int i; + + if (o_idx0 == n_idx0 && o_idx1 == n_idx1) { + /* persist from the bottom up */ + proot = s2p(sb, pproot[o_idx0]); + p = s2p(sb, proot[o_idx1]); + eufs_persist_btree_node(p, o_idx2, n_idx2 - o_idx2); + + /* node[o_idx1] needs persistence */ + if (!o_idx2) { + eufs_persist_btree_node(proot, o_idx1, 1); + + /* node[o_idx0] needs persistence */ + if (!o_idx1) + eufs_persist_btree_node(root, o_idx0, 1); + } + + return; + } + + if (o_idx0 == n_idx0) { + proot = s2p(sb, pproot[o_idx0]); + eufs_persist_inc_btree_h2_by_idx(sb, proot, o_idx1, o_idx2, + n_idx1, n_idx2); + + /* node[o_idx0] needs persistence */ + if (!o_idx1 && !o_idx2) + eufs_persist_btree_node(root, o_idx0, 1); + + return; + } + + /* + * A corner case: o_idx1 == EUFS_FILE_TREE_DEGREE && o_idx2 == 0. This + * can be handled in the function eufs_persist_inc_btree_h2_by_idx, but + * we still check it here for efficiency. + */ + if (o_idx1 < EUFS_FILE_TREE_DEGREE) { + proot = s2p(sb, pproot[o_idx0]); + eufs_persist_inc_btree_h2_by_idx(sb, proot, o_idx1, o_idx2, + EUFS_FILE_TREE_DEGREE, 0); + } else { + WARN_ON(o_idx2 != 0); + } + + for (i = o_idx0 + 1; i < n_idx0; ++i) { + proot = s2p(sb, pproot[i]); + eufs_persist_new_btree_h2_by_idx(sb, proot, 0, + EUFS_FILE_TREE_DEGREE, 0); + } + + if (n_idx1 || n_idx2) { + proot = s2p(sb, pproot[n_idx0]); + eufs_persist_new_btree_h2_by_idx(sb, proot, 0, n_idx1, n_idx2); + /* root[n_idx0] needs to be persisted */ + n_idx0++; + } + + /* root[o_idx0] has been persisted */ + if (o_idx1 || o_idx2) + o_idx0++; + + eufs_persist_btree_node(root, o_idx0, n_idx0 - o_idx0); +} + +/* Only structure persistency is needed */ +int eufs_persist_btree(struct super_block *sb, void *root, int height, + u64 old_size, u64 new_size) +{ + unsigned long old_nblocks, new_nblocks; + __le64 *proot; + __le64 *pproot; + + if (old_size == 0) + old_size = 1; /* at least one block */ + NV_ASSERT(!(height < 0 || height > EUFS_MAX_FILE_TREE_HEIGHT)); + if (!root) + return 0; + /* don't support for persisting for shrink */ + if (old_size > new_size) + return 0; + old_nblocks = DIV_ROUND_UP(old_size, PAGE_SIZE); + new_nblocks = DIV_ROUND_UP(new_size, PAGE_SIZE); + if (old_nblocks == new_nblocks) + return 0; + proot = root; + if (old_nblocks == 1) { + /* data do not need flush */ + if (new_nblocks == 1) + return 0; + + if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(1)) { + eufs_persist_btree_node(root, 0, new_nblocks); + return 0; + } + if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) { + eufs_persist_new_btree_h2(sb, root, 0, new_nblocks); + return 0; + } + if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) { + eufs_persist_new_btree_h3(sb, root, 0, new_nblocks); + return 0; + } + } else if (old_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(1)) { + if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(1)) { + 
eufs_persist_btree_node(root, old_nblocks, + new_nblocks - old_nblocks); + return 0; + } + if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) { + __le64 *p = s2p(sb, proot[0]); + + eufs_persist_btree_node(p, old_nblocks, + EUFS_FILE_TREE_DEGREE - + old_nblocks); + eufs_persist_new_btree_h2(sb, proot, 1, new_nblocks); + return 0; + } + if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) { + void *p; + + pproot = root; + proot = s2p(sb, pproot[0]); + p = s2p(sb, proot[0]); + eufs_persist_btree_node(p, old_nblocks, + EUFS_FILE_TREE_DEGREE - + old_nblocks); + eufs_persist_new_btree_h2( + sb, proot, 1, EUFS_FILE_BCNT_WITH_HEIGHT(2)); + + eufs_persist_new_btree_h3( + sb, root, 1, + new_nblocks - EUFS_FILE_BCNT_WITH_HEIGHT(2)); + + return 0; + } + } else if (old_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) { + if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) { + eufs_persist_inc_btree_h2(sb, root, old_nblocks, + new_nblocks); + return 0; + } + if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) { + pproot = root; + proot = s2p(sb, pproot[0]); + eufs_persist_inc_btree_h2( + sb, proot, old_nblocks, + EUFS_FILE_BCNT_WITH_HEIGHT(2)); + + eufs_persist_new_btree_h3( + sb, root, 1, + new_nblocks - EUFS_FILE_BCNT_WITH_HEIGHT(2)); + + return 0; + } + } else if (old_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) { + if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) { + eufs_persist_inc_btree_h3(sb, root, old_nblocks, + new_nblocks); + return 0; + } + } + BUG(); + return 0; +} + +static ssize_t do_mapping_read(struct address_space *mapping, + struct file_ra_state *_ra, struct file *filp, + char __user *buf, size_t len, loff_t *ppos) +{ + struct inode *inode = mapping->host; + pgoff_t index, end_index; + unsigned long offset; + loff_t isize, pos; + size_t copied = 0, error = 0; + + pos = *ppos; + index = pos >> PAGE_SHIFT; + offset = pos & ~PAGE_MASK; + + isize = i_size_read(inode); + if (!isize) + goto out; + + end_index = (isize - 1) >> PAGE_SHIFT; + do { + unsigned long nr, left; + void *xmem; + + /* nr is the maximum number of bytes to copy from this page */ + nr = PAGE_SIZE; + if (index >= end_index) { + if (index > end_index) + goto out; + + nr = ((isize - 1) & ~PAGE_MASK) + 1; + if (nr <= offset) + goto out; + } + nr = nr - offset; + if (nr > len - copied) + nr = len - copied; + + xmem = eufs_find_data_block_btree(inode, index, NULL); + + BUG_ON(!eufs_access_ok(inode->i_sb, xmem, PAGE_SIZE)); + if (unlikely(!xmem)) + BUG(); + + /* + * Ok, we have the mem, so now we can copy it to user space... + * + * The actor routine returns how many bytes were actually used.. + * NOTE! This may not be the same as how much of a user buffer + * we filled up (we may be padding etc), so we can only update + * "pos" here (the actor routine has to update the user buffer + * pointers and the remaining count). + */ + if (xmem != NULL_ADDR_PTR) + left = __copy_to_user(buf + copied, xmem + offset, nr); + else + left = __clear_user(buf + copied, nr); + + if (left) { + error = -EFAULT; + goto out; + } + + copied += (nr - left); + offset += (nr - left); + index += offset >> PAGE_SHIFT; + offset &= ~PAGE_MASK; + } while (copied < len); + +out: + *ppos = pos + copied; + if (filp) + file_accessed(filp); + + return copied ? copied : error; +} + +/* + * Wrappers. We need to use the rcu read lock to avoid + * concurrent truncate operation. No problem for write because we held + * i_mutex. 
+ */ +ssize_t eufs_file_read(struct file *filp, char __user *buf, size_t len, + loff_t *ppos) +{ + ssize_t res; + + inode_lock_shared(file_inode(filp)); + if (!access_ok(buf, len)) + res = -EFAULT; + else + res = do_mapping_read(filp->f_mapping, &filp->f_ra, filp, buf, + len, ppos); + inode_unlock_shared(file_inode(filp)); + return res; +} + +static __always_inline size_t memcpy_to_nvmm(char *kmem, loff_t offset, + const char __user *buf, + size_t bytes) +{ + size_t copied; + + if (support_clwb && !force_nocache_write) { + copied = bytes - __copy_from_user(kmem + offset, buf, bytes); + eufs_flush_buffer(kmem + offset, copied, 0); + } else { + copied = bytes - __copy_from_user_inatomic_nocache( + kmem + offset, buf, bytes); + } + + return copied; +} + +ssize_t __eufs_file_write_inode(struct inode *inode, const char __user *buf, + size_t count, loff_t pos, loff_t *ppos, + bool zero, bool keep) +{ + long status = 0; + size_t bytes; + ssize_t written = 0; + + if (!count) + return 0; + + eufs_dbg("file write: inode=%px count=%lx pos=%llx, zero=%d keep=%d\n", + inode, count, pos, zero, keep); + + do { + unsigned long index; + unsigned long offset; + size_t copied; + __le64 *parent; + void __pmem *xmem; + void __pmem *xmem_new = NULL; + + offset = (pos & (PAGE_SIZE - 1)); /* Within page */ + index = pos >> PAGE_SHIFT; + bytes = PAGE_SIZE - offset; + if (bytes > count) + bytes = count; + + xmem = eufs_find_data_block_btree(inode, index, &parent); + if (!eufs_access_ok(inode->i_sb, xmem, PAGE_SIZE)) { + dump_stack(); + BUG(); + } + + /* do no wear leveling for 0-level btrees */ + if (xmem != NULL_ADDR_PTR && parent && !zero) { + /* wear threshold! */ + if (!wear_inc(inode->i_sb, xmem)) + xmem_new = eufs_malloc_file_data(inode->i_sb); + } + if (zero) { + copied = bytes; + if (xmem != NULL_ADDR_PTR) + eufs_clear_pmem((char *)xmem + offset, bytes); + } else { + BUG_ON(xmem == NULL_ADDR_PTR); + copied = memcpy_to_nvmm((char *)xmem, offset, buf, + bytes); + } + + if (xmem_new) { + struct eufs_inode_info *vi = EUFS_I(inode); + + eufs_dbg( + "inode=%px pos=%llx xmem:[%px -> %px] weared\n", + inode, pos, xmem, xmem_new); + eufs_alloc_persist(inode->i_sb, xmem_new, true); + + WARN_ON(xmem != + s2p(inode->i_sb, + parent[index % EUFS_FILE_TREE_DEGREE])); + + /* + * disable page fault, clear all related PTEs, and remove the + * dax entry from the radix tree before replace the old block + */ + down_write(&vi->mmap_rwsem); + invalidate_inode_pages2_range(inode->i_mapping, + pos / PAGE_SIZE, + pos / PAGE_SIZE); + memcpy_to_nvmm(xmem_new, 0, xmem, PAGE_SIZE); + parent[index % EUFS_FILE_TREE_DEGREE] = + p2s(inode->i_sb, xmem_new); + up_write(&vi->mmap_rwsem); + + eufs_flush_cacheline( + &parent[index % EUFS_FILE_TREE_DEGREE]); + eufs_pbarrier(); + + /* + * It is important to persist all preivous alllocations + * here. Otherwise, the xmem might be freed before its + * information is handled in the page_batch, which will + * cause xmem being marked as allocated (page_batch does + * this) when it is in the free list. + * xfstests/generic/299 can trigger this. + */ + eufs_alloc_batch_persist_reset( + inode->i_sb, &EUFS_I(inode)->page_batch); + nv_free_rest(inode->i_sb, xmem); + } + + eufs_dbg( + "! 
file writing to pos=%ld xmem=%px, offset=%ld, buf=%px, bytes=%ld index=%ld, copied=%ld\n", + (long)pos, xmem, (long)offset, buf, (long)bytes, + (long)index, (long)copied); + + if (likely(copied > 0)) { + written += copied; + count -= copied; + pos += copied; + buf += copied; + } + + if (unlikely(copied != bytes)) { + status = -EFAULT; + break; + } + } while (count); + if (ppos) + *ppos = pos; + eufs_dbg("pos: %d inode->i_size: %d written: %d\n", (int)pos, + (int)inode->i_size, (int)written); + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + */ + if (!keep && pos > inode->i_size) + eufs_inode_size_write(inode, pos); + + return written ? written : status; +} + +ssize_t __eufs_file_write(struct address_space *mapping, + const char __user *buf, size_t count, loff_t pos, + loff_t *ppos, bool zero, bool keep) +{ + return __eufs_file_write_inode(mapping->host, buf, count, pos, ppos, + zero, keep); +} + +ssize_t eufs_file_write(struct file *filp, const char __user *buf, + size_t len, loff_t *ppos) +{ + struct address_space *mapping = filp->f_mapping; + struct inode *inode = mapping->host; + struct eufs_inode_info *vi = EUFS_I(inode); + struct super_block *sb = inode->i_sb; + ssize_t written = 0; + loff_t pos; + size_t count, ret; + bool osync = false; + + inode_lock(inode); + + if (!access_ok(buf, len)) { + ret = -EFAULT; + goto out; + } + + if (filp->f_flags & O_APPEND) + pos = inode->i_size; + else + pos = *ppos; + + if (filp->f_flags & __O_SYNC) + osync = true; + + count = len; + if (count == 0) { + ret = 0; + goto out; + } + + NV_ASSERT(sb->s_blocksize == PAGE_SIZE); + + ret = file_remove_privs(filp); + if (ret) + goto out; + + inode->i_ctime = inode->i_mtime = current_time(inode); + + /* + * It's a little tricky here. We should use mmap_rwsem to protect + * the block allocation and i_size update, but mmap_rwsem can not + * be taken during block writing because that will lead to + * dead-lock. We only use mmap_rwsem to protect the block allocation, + * and there are two reasons we can do that: + * 1. mmap fault takes the mmap_rwsem before read i_size, so it + * can not read the updated i_size before the allocation is done. + * 2. write only extends the block tree, and will not remove or + * modify the existed block mappings. + */ + down_write(&vi->mmap_rwsem); + /* + * Possible cases for writing [pos~pos+len) + * + * Definitions + * EOF: the byte after last valid byte + * EOF-page: page contains EOF + * first: the page pos belongs to + * last: the page pos+len belongs to + * Macro EOP(p): the last byte of p's page + * + * IMPORTANT NOTICE: we do not guarantee that [EOF~EOP(EOF)] are + * zeroed! When we mmap a file, we will erase that (in DRAM) in the + * mmap syscall. This can concurrently happen with a write syscall + * which may cause consistency problems (especially when it's an + * append). Concurrent mmap-access and read-/write-access should be + * protected by the application. + * + * 1) EOF-page | first | last + * area-to-zero: [EOF~EOP(EOF)] + * 2) EOF-page=first| last + * area-to-zero: [EOF~pos) if EOF<pos + * 3) first | EOF-page | last + * area-to-zero: none + * 4) first | EOF-page=last + * area-to-zero: none + * 5) first | last | EOF-page + * area-to-zero: + * And for ALL cases, if first/last page is a hole, we need to zero + * the part that will not be written in this write. 
+ */ + + /* don't zero-out the allocated blocks */ + ret = eufs_alloc_blocks_btree_for_write(inode, pos, count); + if (IS_ERR_VALUE(ret)) { + up_write(&vi->mmap_rwsem); + goto out; + } + + /* If we decide to guarantee zeroed file tail, we may use this snippet. */ + /* zeroing part of the last block goes beyond the new EOF */ + if (PAGE_ALIGN(pos + count) > PAGE_ALIGN(inode->i_size)) + eufs_inode_zero_range(inode, pos + count, + PAGE_ALIGN(pos + count)); + + /* + * zeroing the hole created by write. + * part of the hole is included in the last page that exceeds EOF, + * and it has already been zeroed, so only zeroing the remaining part. + */ + if (pos > inode->i_size) { + loff_t offset = inode->i_size & (PAGE_SIZE - 1); + + if (offset || !inode->i_size) { + /* + * Zero EOF~EOP(EOF). + * This also satisfies case 2), since [EOP(EOF)+1~pos] + * are holes. + */ + eufs_inode_zero_range_len(inode, inode->i_size, + PAGE_SIZE - offset); + } + } + up_write(&vi->mmap_rwsem); + + written = __eufs_file_write(mapping, buf, count, pos, ppos, false, + false); + if (written < 0 || written != count) { + eufs_dbg("write incomplete/failed: written %ld len %ld pos %llx\n", + written, count, pos); + } + if (osync) { + eufs_alloc_batch_persist_reset(sb, &EUFS_I(inode)->page_batch); + eufs_sync_pinode(inode, EUFS_PI(inode), false); + } else { + request_persistence(inode); + } + + ret = written; + +out: + inode_unlock(inode); + return ret; +} + +static int eufs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *src) +{ + struct super_block *sb = inode->i_sb; + struct eufs_sb_info *sbi = sb->s_fs_info; + unsigned int blkbits = inode->i_blkbits; + unsigned long first_block = offset >> blkbits; + bool new = false; + void *__pmem xmem; + __le64 *__pmem parent; + + eufs_dbg("fault: inode=%px addr=0x%llx rw=%d length=%lld\n", inode, + offset, flags & IOMAP_WRITE, length); + inode_leaf_lock(inode); + xmem = eufs_find_data_block_btree(inode, first_block, &parent); + /* allocate a new block for write */ + if (xmem == NULL_ADDR_PTR && (flags & IOMAP_WRITE)) { + int ofs = first_block & (EUFS_FILE_TREE_DEGREE - 1); + + /* + * We cannot use normal allocation here because they can send + * IPI to gather pages and blocks. So here we need to use + * non-blocking version, which uses reserved pages instead of + * gathering pages by IPI. 
+ */ + xmem = eufs_zalloc_file_data(inode->i_sb); + if (!xmem) { + inode_leaf_unlock(inode); + return -ENOSPC; + } + + eufs_alloc_persist(inode->i_sb, xmem, false); + /* + * the first block is preallocated during inode initialization, + * so parent should not be NULL when xmem is NULL_ADDR + */ + BUG_ON(!parent); + eufs_flush_page(xmem); + parent[ofs] = p2s(sb, xmem); + eufs_flush_cacheline(&parent[ofs]); + + new = true; + } + inode_leaf_unlock(inode); + + iomap->flags = 0; + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = (u64)first_block << blkbits; + iomap->dax_dev = sbi->s_dax_dev; + iomap->length = 1 << blkbits; + + if (xmem == NULL_ADDR_PTR) { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + } else { + iomap->type = IOMAP_MAPPED; + iomap->addr = (xmem - sbi->virt_addr); + } + + if (new) + iomap->flags |= IOMAP_F_NEW; + + return 0; +} + +static int eufs_iomap_end(struct inode *inode, loff_t offset, loff_t length, + ssize_t written, unsigned int flags, + struct iomap *iomap) +{ + return 0; +} + +const struct iomap_ops eufs_iomap_ops = { + .iomap_begin = eufs_iomap_begin, + .iomap_end = eufs_iomap_end, +}; + +static unsigned int eufs_dax_fault(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + struct eufs_inode_info *vi = EUFS_I(inode); + int ret; + + if (vmf->flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(inode->i_sb); + file_update_time(vmf->vma->vm_file); + } + + /* + * i_size and the block tree must be consistent during mmap fault, + * else eulerfs may map to a freed block or a hole instead of an + * allocated block. + * + * Now i_rwsem is used to protect against the update of i_size and + * the block tree, but it can NOT been used in mmap fault path, + * because mmap fault may be triggered in the middle of + * write or read operation when the dst or src buffer is a mapped + * range of the same file, and that will lead to dead-lock due to + * two acquisitions of the same lock (i_rwsem). + * + * So mmap_rwsem is provided. The read-lock will be used in mmap + * fault path, and the write-lock will be used in truncate & + * fallocate & write paths. 
+ */ + down_read(&vi->mmap_rwsem); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ret, &eufs_iomap_ops); + up_read(&vi->mmap_rwsem); + + if (vmf->flags & FAULT_FLAG_WRITE) + sb_end_pagefault(inode->i_sb); + + return ret; +} + +const struct vm_operations_struct eufs_file_vm_ops = { + .fault = eufs_dax_fault, + .page_mkwrite = eufs_dax_fault, + .pfn_mkwrite = eufs_dax_fault, +}; + +int eufs_dax_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_flags |= VM_MIXEDMAP; + vma->vm_ops = &eufs_file_vm_ops; + eufs_dbg("dax file mmaped!\n"); + return 0; +} + +static loff_t eufs_seek_block(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + loff_t maxbytes = inode->i_sb->s_maxbytes; + pgoff_t pgofs; + loff_t data_ofs = offset, isize; + __le64 *parent; + void *addr; + unsigned int ofs; + + inode_lock(inode); + + isize = i_size_read(inode); + + if (offset >= isize) + goto fail; + + pgofs = (pgoff_t)(offset >> PAGE_SHIFT); + + if (EUFS_I(inode)->hole_at_sta && pgofs == 0) { + if (whence == SEEK_HOLE) + goto found; + pgofs++; + data_ofs = (loff_t)pgofs << PAGE_SHIFT; + } + + while (data_ofs < isize) { + addr = eufs_find_data_block_btree(inode, pgofs, &parent); + ofs = pgofs & (EUFS_FILE_TREE_DEGREE - 1); + while (ofs < EUFS_FILE_TREE_DEGREE && data_ofs < isize) { + if (parent) + addr = s2p(inode->i_sb, parent[ofs]); + if (addr == NULL_ADDR_PTR && whence == SEEK_HOLE) + goto found; + if (addr && addr != NULL_ADDR_PTR && + whence == SEEK_DATA) + goto found; + ofs++; + pgofs++; + data_ofs = (loff_t)pgofs << PAGE_SHIFT; + } + } + if (whence == SEEK_DATA) + goto fail; +found: + if (whence == SEEK_HOLE && data_ofs > isize) + data_ofs = isize; + inode_unlock(inode); + return vfs_setpos(file, data_ofs, maxbytes); +fail: + inode_unlock(inode); + return -ENXIO; +} + +loff_t eufs_file_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + loff_t maxbytes = inode->i_sb->s_maxbytes; + + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek_size(file, offset, whence, maxbytes, + i_size_read(inode)); + case SEEK_DATA: + case SEEK_HOLE: + if (offset < 0) + return -ENXIO; + return eufs_seek_block(file, offset, whence); + } + return -EINVAL; +} diff --git a/fs/eulerfs/dax.h b/fs/eulerfs/dax.h new file mode 100644 index 000000000000..fe129191adc3 --- /dev/null +++ b/fs/eulerfs/dax.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef EUFS_DAX_H +#define EUFS_DAX_H + +#include "euler.h" + +#define EUFS_FILE_BCNT_WITH_HEIGHT(h) \ + (1ULL << ((h)*EUFS_FILE_TREE_DEGREE_SHIFT)) +#define EUFS_PTR_CNT_SHIFT_PER_CACHELINE 3 + +#define EUFS_H2_INDEX_IN_L0(bidx) ((bidx) >> EUFS_FILE_TREE_DEGREE_SHIFT) +#define EUFS_H2_INDEX_IN_L1(bidx) ((bidx) & (EUFS_FILE_TREE_DEGREE - 1)) +#define EUFS_H2_IS_FREE_L1_SUBTREE(idx0, bcnt) \ + (((idx0) << EUFS_FILE_TREE_DEGREE_SHIFT) >= (bcnt)) + +#define EUFS_H3_INDEX_IN_L0(bidx) \ + ((bidx) >> (EUFS_FILE_TREE_DEGREE_SHIFT * 2)) +/* (bidx - (idx0 << (SHIFT * 2))) >> SHIFT */ +#define EUFS_H3_INDEX_IN_L1(idx0, bidx) \ + (((bidx) >> EUFS_FILE_TREE_DEGREE_SHIFT) - \ + ((idx0) << EUFS_FILE_TREE_DEGREE_SHIFT)) +#define EUFS_H3_INDEX_IN_L2(bidx) ((bidx) & (EUFS_FILE_TREE_DEGREE - 1)) + +#define EUFS_H3_IS_FREE_L2_SUBTREE(idx0, idx1, bcnt) \ + ((((idx0) << (EUFS_FILE_TREE_DEGREE_SHIFT * 2)) + \ + ((idx1) << EUFS_FILE_TREE_DEGREE_SHIFT)) >= (bcnt)) +#define EUFS_H3_IS_FREE_L1_SUBTREE(idx0, bcnt) \ + (((idx0) << (EUFS_FILE_TREE_DEGREE_SHIFT * 2)) >= (bcnt)) + +int eufs_alloc_blocks_btree(struct inode *inode, unsigned long start_block, + unsigned long num_blocks, int zero); +ssize_t eufs_file_read(struct file *filp, char __user *buf, size_t len, + loff_t *ppos); +ssize_t eufs_file_write(struct file *filp, const char __user *buf, + size_t len, loff_t *ppos); +int eufs_dax_file_mmap(struct file *file, struct vm_area_struct *vma); + +int eufs_extend_btree(struct inode *inode, unsigned long num_blocks); + +int eufs_shrink_btree(struct inode *inode); + +static __always_inline u64 encode_root(u64 off, u64 height) +{ + return (off & ((1UL << 56) - 1)) | (height << 56); +} + +static __always_inline u64 root_ptr(u64 encoded_root) +{ + return (u64)(encoded_root & ((0x1UL << 56) - 1)); +} +static __always_inline int root_height(u64 ptr) +{ + return ((u64)ptr >> 56) & 0xff; +} + +int eufs_free_btree(struct super_block *sb, void *root, int height, + u64 blocks); +int eufs_persist_btree(struct super_block *sb, void *root, int height, + u64 old_size, u64 new_size); + +ssize_t __eufs_file_write(struct address_space *mapping, + const char __user *buf, size_t count, loff_t pos, + loff_t *ppos, bool zero, bool keep); +ssize_t __eufs_file_write_inode(struct inode *inode, + const char __user *buf, size_t count, + loff_t pos, loff_t *ppos, bool zero, + bool keep); + +loff_t eufs_file_llseek(struct file *file, loff_t offset, int whence); + +/* zeroing range [pos, end) */ +static inline void eufs_inode_zero_range(struct inode *inode, loff_t pos, + loff_t end) +{ + if (pos == end) + return; + __eufs_file_write_inode(inode, NULL, end - pos, pos, NULL, true, + true); +} +/* zeroing range [pos, end) */ +static inline void eufs_inode_zero_range_len(struct inode *inode, loff_t pos, + size_t len) +{ + if (!len) + return; + __eufs_file_write_inode(inode, NULL, len, pos, NULL, true, true); +} + +#endif /* EUFS_DAX_H */
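Side note on the on-media root encoding declared in dax.h above: the tree height is packed into the top byte of a 64-bit word and the block offset lives in the lower 56 bits, while the EUFS_H2_* macros split a logical block index into per-level slot indices. The following standalone userspace sketch only illustrates that arithmetic; it assumes a tree degree shift of 9 (512 eight-byte pointer slots per 4 KiB index block), a constant that actually comes from const.h and may differ.

#include <stdio.h>
#include <stdint.h>

#define DEGREE_SHIFT 9                 /* assumed: 512 x 8-byte slots per 4 KiB index block */
#define DEGREE       (1ULL << DEGREE_SHIFT)

/* Pack/unpack a root pointer: offset in bits 0..55, height in bits 56..63 */
static uint64_t encode_root(uint64_t off, uint64_t height)
{
	return (off & ((1ULL << 56) - 1)) | (height << 56);
}

static uint64_t root_ptr(uint64_t enc)    { return enc & ((1ULL << 56) - 1); }
static int      root_height(uint64_t enc) { return (enc >> 56) & 0xff; }

int main(void)
{
	uint64_t enc = encode_root(0x12345000ULL, 2); /* height-2 tree at offset 0x12345000 */
	uint64_t bidx = 1000;                         /* logical block index within the file */

	printf("offset=0x%llx height=%d\n",
	       (unsigned long long)root_ptr(enc), root_height(enc));

	/* Height-2 lookup: slot in the root (L0) and slot in the leaf index block (L1) */
	printf("L0 slot=%llu L1 slot=%llu\n",
	       (unsigned long long)(bidx >> DEGREE_SHIFT),
	       (unsigned long long)(bidx & (DEGREE - 1)));
	return 0;
}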
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR
CVE: NA
--------------------------------------
Implement file operations and inode operations for regular files.
Signed-off-by: Mingkai Dong dongmingkai1@huawei.com
Signed-off-by: Hou Tao houtao1@huawei.com
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Hou Tao houtao1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 fs/eulerfs/file.c | 294 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 294 insertions(+)
 create mode 100644 fs/eulerfs/file.c
diff --git a/fs/eulerfs/file.c b/fs/eulerfs/file.c new file mode 100644 index 000000000000..d5a743c102e3 --- /dev/null +++ b/fs/eulerfs/file.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/uio.h> +#include <linux/mm.h> +#include <linux/uaccess.h> +#include <linux/falloc.h> +#include <asm/mman.h> +#include "euler.h" +#include "dax.h" +#include "dep.h" + +static long eufs_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + struct eufs_inode_info *vi = EUFS_I(inode); + loff_t end = offset + len; + unsigned long start_page = offset >> PAGE_SHIFT, + end_page = DIV_ROUND_UP(end, PAGE_SIZE); + bool zero = mode & FALLOC_FL_ZERO_RANGE; + bool keep = mode & FALLOC_FL_KEEP_SIZE; + long r = 0; + + if (mode & (FALLOC_FL_INSERT_RANGE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE)) + return -EOPNOTSUPP; + + /* Fix xfstests 228 */ + r = inode_newsize_ok(inode, len + offset); + if (r) + return r; + + inode_lock(inode); + + down_write(&vi->mmap_rwsem); + + r = eufs_alloc_blocks_btree( + inode, start_page, end_page - start_page, + (offset < inode->i_size) ? + /* Zero if overwriting */ + EUFS_ALLOC_BLOCKS_ZERO_ALL : + (((offset >> PAGE_SHIFT) <= + (inode->i_size >> PAGE_SHIFT) && + (offset > inode->i_size)) ? 
+ EUFS_ALLOC_BLOCKS_ZERO_EDGE : + EUFS_ALLOC_BLOCKS_ZERO_NONE)); + if (IS_ERR_VALUE(r)) + goto out; + + eufs_dbg( + "fallocate (f=%px, fsize=%llx, offset=%llx, len=%llx, zero=%d, keep=%d)\n", + file, inode->i_size, offset, len, zero, keep); + if (offset + len >= inode->i_size) { + if (!zero) { + /* zero inode->i_size ~> offset + len */ + __eufs_file_write(file->f_mapping, NULL, + offset + len - inode->i_size, + inode->i_size, NULL, true, keep); + + /* zero part of the last block goes beyond the new EOF */ + eufs_inode_zero_range(inode, offset + len, + PAGE_ALIGN(offset + len)); + } else if (offset >= inode->i_size) { + eufs_dbg("zero(f=%px): %llx to %llx\n", file, + inode->i_size, offset); + /* zero inode->i_size ~> offset */ + __eufs_file_write(file->f_mapping, NULL, + offset - inode->i_size, + inode->i_size, NULL, true, keep); + } + + if (!keep) + inode->i_size = offset + len; + } + + if (zero) { + /* zero offset ~> offset + len */ + __eufs_file_write(file->f_mapping, NULL, len, offset, NULL, + true, keep); + + /* zero part of the last block goes beyond the new EOF */ + if (offset + len >= inode->i_size) + eufs_inode_zero_range(inode, offset + len, + PAGE_ALIGN(offset + len)); + } + + request_persistence(inode); + +out: + up_write(&vi->mmap_rwsem); + inode_unlock(inode); + + return r; +} + +static void eufs_dir_fsync_until_seq(struct inode *dir, u32 dep_seq) +{ + struct eufs_inode_info *vinode = EUFS_I(dir); + + if (eufs_dep_seq_after_eq(vinode->i_persisted_dep_seq, dep_seq)) + return; + + inode_lock(dir); + eufs_dir_fsync_oneshot(dir); + inode_unlock(dir); +} + +static void eufs_persist_dentries(struct inode *inode) +{ + struct eufs_inode_info *vi = EUFS_I(inode); + struct list_head *head = &vi->i_owner_list; + + if (list_empty(head)) + return; + + spin_lock(&vi->i_owner_lock); + while (!list_empty(head)) { + struct dep_node *dep; + struct inode *dir; + u32 seq; + + dep = list_first_entry(head, struct dep_node, owner_node); + dir = dep->dir; + seq = dep->seq; + + /* let it be deleted by dir persistence ? */ + list_del_init(&dep->owner_node); + spin_unlock(&vi->i_owner_lock); + + eufs_dir_fsync_until_seq(dir, seq); + + spin_lock(&vi->i_owner_lock); + } + spin_unlock(&vi->i_owner_lock); +} + +static void eufs_persist_parent_dentry_till_root(struct dentry *child) +{ + struct dentry *cur = child; + + while (!IS_ROOT(cur)) { + struct dentry *parent; + + parent = cur->d_parent; + eufs_persist_dentries(parent->d_inode); + cur = parent; + } +} + +static int eufs_persist_parent_dentries_till_root(struct file *filp) +{ + struct inode *inode = filp->f_inode; + unsigned int nlink = inode->i_nlink; + struct dentry *alias; + struct dentry **aliases; + unsigned int cnt; + unsigned int idx; + + if (nlink == 0) + return 0; + + if (nlink == 1) { + eufs_persist_parent_dentry_till_root(filp->f_path.dentry); + return 0; + } + + aliases = kmalloc(nlink * sizeof(*aliases), GFP_KERNEL); + if (aliases == NULL) + return -ENOMEM; + + cnt = 0; + spin_lock(&inode->i_lock); + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + dget(alias); + aliases[cnt++] = alias; + + if (cnt >= nlink) + break; + } + spin_unlock(&inode->i_lock); + + for (idx = 0; idx < cnt; idx++) { + eufs_persist_parent_dentry_till_root(aliases[idx]); + dput(aliases[idx]); + } + + kfree(aliases); + + return 0; +} + +/* + * inconsistency: + * + * non-dir: + * data -> inode & name -> dentries... -> parent dentries... 
+ * dentry [parent data -> parent inode] + * (1) link can be high: new A/1, link B/2 (A/1), fsync A/1 + * (2) link can be low: new A/1, link B/2 (A/1), fsync A/1, rm A/1, + * fsync A/1 + * + * dir: + * children inodes -> data -> inode & name + * (1) link & size can be low: new A/1..3, fsync A + * (2) link & size can be high: new A/1..3, fsync A, rm A/1, fsync A + */ +/* This function is called by both msync() and fsync(). */ +int eufs_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + /* Sync from start to end[inclusive] */ + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct eufs_inode_info *vi = EUFS_I(inode); + umode_t mode; + int err; + bool inode_is_dirty; + + mode = inode->i_mode; + /* persist file data written through mmap */ + if (S_ISREG(mode)) { + err = filemap_write_and_wait_range(inode->i_mapping, start, + end); + if (err) + return err; + } + + /* persist its data and inode first */ + inode_is_dirty = false; + inode_lock(inode); + if (vi->i_is_dirty) { + inode_is_dirty = true; + } else { + /* + * modifications of mtime/ctime/atime has not been tracked + * by persister yet, so check it here + */ + spin_lock(&inode->i_lock); + if (inode->i_state & I_DIRTY_SYNC) { + inode->i_state &= ~I_DIRTY_SYNC; + inode_is_dirty = true; + } + spin_unlock(&inode->i_lock); + } + if (inode_is_dirty) + fsync_oneshot(inode); + inode_unlock(inode); + + /* + * persist dentries related with the inode. If it is non-dir, + * there may be multiple dentries related with it (namely hard-link). + */ + eufs_persist_dentries(inode); + + /* + * persist parent dentries and recurse upward until the root dentry is reached. + * For non-dir, there may be multiple parent dentries due to hard-link. + */ + if (!S_ISDIR(mode)) + err = eufs_persist_parent_dentries_till_root(file); + else + eufs_persist_parent_dentry_till_root(file->f_path.dentry); + + return err; +} + +const struct file_operations eufs_file_operations = { + .llseek = eufs_file_llseek, + .read = eufs_file_read, + .write = eufs_file_write, + .mmap = eufs_dax_file_mmap, + .open = generic_file_open, + .fsync = eufs_fsync, + .flush = NULL, + .fallocate = eufs_fallocate, +}; + +const struct inode_operations eufs_file_inode_operations = { + .setattr = eufs_notify_change, + .getattr = eufs_file_getattr, +};
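As the comment block above eufs_fsync describes, syncing a regular file is meant to persist not only the file's data and inode but also the dentries that make it reachable, walking parent directories up to the root. A minimal userspace illustration of that intended guarantee follows; the mount point /mnt/eulerfs and the directory names are hypothetical, and this is only a sketch of the expected crash-consistency behavior, not part of the patch.

#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	const char *path = "/mnt/eulerfs/a/b/file";  /* hypothetical eulerfs mount */
	int fd;

	mkdir("/mnt/eulerfs/a", 0755);
	mkdir("/mnt/eulerfs/a/b", 0755);

	fd = open(path, O_CREAT | O_WRONLY, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "hello", 5) != 5)
		perror("write");

	/*
	 * A single fsync() on the file is expected to persist the file's
	 * data and inode plus the dentry chain a/ -> b/ -> file, so the
	 * file stays reachable after a crash without fsyncing each parent
	 * directory explicitly.
	 */
	if (fsync(fd))
		perror("fsync");
	close(fd);
	return 0;
}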
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR
CVE: NA
--------------------------------------
Implement inode_operations for dir inode and special inode.
Signed-off-by: Mingkai Dong dongmingkai1@huawei.com
Signed-off-by: Hou Tao houtao1@huawei.com
Signed-off-by: Zhikang Zhang zhangzhikang1@huawei.com
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Hou Tao houtao1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 fs/eulerfs/namei.c | 872 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 872 insertions(+)
 create mode 100644 fs/eulerfs/namei.c
diff --git a/fs/eulerfs/namei.c b/fs/eulerfs/namei.c new file mode 100644 index 000000000000..e4c6c36575f2 --- /dev/null +++ b/fs/eulerfs/namei.c @@ -0,0 +1,872 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/crc32c.h> +#include "euler.h" +#include "dht.h" +#include "dep.h" +#include "lock.h" + +/* + * If sbi->s_draining is set, do fsync after each namei syscall! This is much + * better than lock transfer for volatility quota. + */ +static void sync_on_draining(struct inode *dir, struct inode *inode) +{ + struct eufs_sb_info *sbi = EUFS_SB(dir->i_sb); + + if (likely(!sbi->s_draining)) + return; + + /* fsync the inodes to reduce the number of dirty inodes */ + fsync_on_draining(dir, inode); +} + +static __always_inline void +eufs_trace_newfile(const char *prompt, struct inode *dir, struct inode *inode, + struct eufs_inode *pi, struct nv_dict_entry *de) +{ + eufs_dbg("%s (%s): inode=%px pi=%px pi->root=%llx pi->mode=0%o de=%px de->len=%lld de->name=%6s de->nextname=%llx inode->nlink=%d pi->nlink=%d de->volatile_next=%llx de->next=%llx\n", + __func__, prompt, inode, pi, eufs_iread_root(pi), + eufs_iread_mode(pi), de, HASHLEN_LEN(de->hv), de->name, + de->nextname, inode->i_nlink, pi->i_nlink, de->volatile_next, + de->next); + + BUG_ON(inode->i_mode != pi->i_mode); +} + +static __always_inline void eufs_trace_delfile(const char *prompt, + struct inode *dir, + struct inode *inode, + struct eufs_inode *pi) +{ + eufs_dbg("%s (%s): inode=%px pi=%px pi->root=%llx pi->mode=0%o inode->i_nlink=%d pi->i_nlink=%d\n", + __func__, prompt, inode, pi, eufs_iread_root(pi), + eufs_iread_mode(pi), inode->i_nlink, eufs_iread_nlink(pi)); + /* + * because inode is locked by unlink/link, so the increment/decrement + * of nlink should be in order and its max value is (EUFS_LINK_MAX - 1) + * after unlink. + */ + if ((inode->i_mode & S_IFMT) != S_IFDIR) + WARN(inode->i_nlink >= EUFS_LINK_MAX, + "unexpected nlink %d for inode 0x%lx\n", inode->i_nlink, + inode->i_ino); +} + +static __always_inline struct nv_dict_entry * +nv_dict_add_wrapper(struct inode *dir, u64 **nv_header, struct eufs_inode *pi, + hashlen_t hv, const char *name) +{ + struct eufs_inode_info *vi = EUFS_I(dir); + + NV_ASSERT(pi); + if (!vi->i_volatile_dict) + vi->i_volatile_dict = eufs_zalloc_page(); + + /* insert into parent dir hash table */ + return nv_dict_add(dir, nv_header, hv, name, pi); +} + +static __always_inline struct nv_dict_entry * +nv_dict_del_wrapper(struct inode *dir, struct nv_dict_entry **prevde, + u64 **nv_header, hashlen_t hv, const char *name) +{ + struct eufs_inode_info *vi = EUFS_I(dir); + /* Alloc for dict if necessary */ + if (!vi->i_volatile_dict) + vi->i_volatile_dict = eufs_zalloc_page(); + + /* insert into parent dir hash table */ + return nv_dict_delete(dir, prevde, nv_header, hv, name); +} + +/* + * Methods themselves. 
+ */ +static struct dentry *eufs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct inode *inode = NULL; + struct nv_dict_entry *de; + const char *name; + u64 hv; + + if (dentry->d_name.len > EUFS_MAX_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + hv = hash(dentry->d_name.name, dentry->d_name.len); + name = dentry->d_name.name; + de = nv_dict_find(dir, hv, name); + if (!de) + goto not_found; + + inode = eufs_iget(dir->i_sb, s2p(dir->i_sb, de->inode)); + if (inode == ERR_PTR(-ESTALE)) { + eufs_err(dir->i_sb, "deleted inode referenced: 0x%lx", + inode->i_ino); + return ERR_PTR(-EIO); + } +not_found: + + if (inode) + BUG_ON(atomic_read(&inode->i_count) < 1); + return d_splice_alias(inode, dentry); +} + +static int add_pinode(struct inode *dir, struct dentry *dentry, + struct inode *inode, bool need_unlock_inode) +{ + /* Name must be checked before this is invoked. */ + struct eufs_inode_info *dir_vi = EUFS_I(dir); + struct eufs_inode *pi; + const char *name; + struct nv_dict_entry *de; + u64 *nv_header; + u64 hv; + struct dep_node *dep; + int err; + + dep = eufs_alloc_dep_node(); + if (!dep) + return -ENOMEM; + + if (need_unlock_inode) + eufs_inode_mark_lock_transferable(inode); + + /* Add to dict */ + pi = EUFS_PI(inode); + name = dentry->d_name.name; + hv = hash(name, dentry->d_name.len); + de = nv_dict_add_wrapper(dir, &nv_header, pi, hv, name); + if (IS_ERR(de)) { + err = PTR_ERR(de); + goto err_out; + } + + /* One more dentry */ + dir->i_size++; + eufs_dbg("diradd +> %lld of %px 0x%lx\n", dir->i_size, dir, dir->i_ino); + + /* Update dir time */ + dir->i_ctime = dir->i_mtime = current_time(dir); + + dep_new_insert(dep, dir, DEP_DIRADD, NULL, nv_header, de, inode, + dir_vi->i_next_dep_seq); + + if (need_unlock_inode) + eufs_inode_wait_lock_transfer_done(inode); + + dir_vi->i_next_dep_seq++; + + return 0; + +err_out: + if (need_unlock_inode) + eufs_inode_wait_lock_transfer_done(inode); + eufs_free_dep_node(dep); + return err; +} + +static __always_inline int del_pinode(struct inode *dir, struct dentry *dentry, + bool is_dir) +{ + struct eufs_inode_info *dir_vi = EUFS_I(dir); + struct inode *inode = dentry->d_inode; + struct nv_dict_entry *de, *prevde; + u64 *nv_header; + const char *name; + u64 hv; + struct dep_node *dep; + struct eufs_inode *pi; + int err; + + dep = eufs_alloc_dep_node(); + if (!dep) + return -ENOMEM; + + eufs_inode_mark_lock_transferable(inode); + + /* Remove from parent dir hash table */ + name = dentry->d_name.name; + hv = hash(name, dentry->d_name.len); + de = nv_dict_del_wrapper(dir, &prevde, &nv_header, hv, name); + if (unlikely(!de)) { + err = -ENOENT; + goto err_out; + } + + /* Drop one dentry */ + dir->i_size--; + eufs_dbg("dirdel -> %lld of %px 0x%lx\n", dir->i_size, dir, dir->i_ino); + + /* Update parent dir time */ + dir->i_ctime = dir->i_mtime = current_time(dir); + + /* Update inode ctime and link */ + inode->i_ctime = dir->i_ctime; + if (is_dir) { + /* Update nlink and ctime for the removed inode */ + WARN_ON(inode->i_nlink != 2); + clear_nlink(inode); + } else if (inode->i_nlink) { + drop_nlink(inode); + } else { + pi = EUFS_PI(inode); + eufs_info("!%s!: inode=%p, inode->i_nlink=%d inode->i_mode=0%o pi=%p pi->i_nlink=%d pi->i_mode=0%o\n", + __func__, inode, inode->i_nlink, inode->i_mode, pi, + eufs_iread_nlink(pi), eufs_iread_mode(pi)); + BUG(); + } + + dep_new_insert(dep, dir, DEP_DIRREM, prevde, nv_header, de, inode, + dir_vi->i_next_dep_seq); + + eufs_inode_wait_lock_transfer_done(inode); + + dir_vi->i_next_dep_seq++; + + 
return 0; + +err_out: + eufs_inode_wait_lock_transfer_done(inode); + eufs_free_dep_node(dep); + return err; +} + +static void eufs_free_new_inode(struct inode *inode) +{ + clear_nlink(inode); + remove_inode_hash(inode); + unlock_new_inode(inode); + iput(inode); +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int eufs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct inode *inode; + int err; + + /* name checks */ + if (unlikely(!dentry->d_name.len)) + return -EINVAL; + if (unlikely(dentry->d_name.len > EUFS_MAX_NAME_LEN)) + return -ENAMETOOLONG; + + inode = pre_inodes_get(dentry, dir, mode, false, 0); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + err = add_pinode(dir, dentry, inode, false); + if (err) { + eufs_free_new_inode(inode); + return err; + } + + inode->i_op = &eufs_file_inode_operations; + inode->i_mapping->a_ops = &eufs_aops; + inode->i_fop = &eufs_file_operations; + + eufs_trace_newfile("!create!", dir, inode, EUFS_PI(inode), NULL); + + EUFS_I(inode)->i_is_dirty = true; + d_instantiate(dentry, inode); + unlock_new_inode(inode); + + sync_on_draining(dir, NULL); + + return 0; +} + +static int eufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t rdev) +{ + struct inode *inode; + int err; + + if (unlikely(!dentry->d_name.len)) + return -EINVAL; + if (unlikely(dentry->d_name.len > EUFS_MAX_NAME_LEN)) + return -ENAMETOOLONG; + + inode = pre_inodes_get(dentry, dir, mode, true, rdev); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + err = add_pinode(dir, dentry, inode, false); + if (err) { + eufs_free_new_inode(inode); + return err; + } + + inode->i_op = &eufs_special_inode_operations; + + eufs_trace_newfile("!mknode!", dir, inode, EUFS_PI(inode), NULL); + + EUFS_I(inode)->i_is_dirty = true; + d_instantiate(dentry, inode); + unlock_new_inode(inode); + + sync_on_draining(dir, NULL); + + return 0; +} + +static int eufs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct inode *inode = NULL; + struct eufs_inode *pi; + u32 len = strlen(symname); + void *pi_root; + int err; + + /* name checks */ + if (unlikely(!dentry->d_name.len)) + return -EINVAL; + if (unlikely(dentry->d_name.len > EUFS_MAX_NAME_LEN)) + return -ENAMETOOLONG; + if (unlikely(len > EUFS_MAX_SYMLINK_LEN)) + return -ENAMETOOLONG; + + /* alloc vfs inode and xxfs inode */ + inode = pre_inodes_get(dentry, dir, S_IFLNK | S_IRWXUGO, false, 0); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + pi = EUFS_FRESH_PI(EUFS_PI(inode)); + + pi_root = o2p(dir->i_sb, eufs_iread_root(pi)); + + /* copy the symname */ + *((u64 *)pi_root) = hash(symname, len); + memcpy(((char *)pi_root) + sizeof(u64), symname, len); + BUG_ON(!eufs_access_ok(inode->i_sb, pi_root, PAGE_SIZE)); + + /* update the size */ + inode->i_size = len; + + err = add_pinode(dir, dentry, inode, false); + if (err) { + eufs_free_new_inode(inode); + return err; + } + + inode->i_op = &eufs_symlink_inode_operations; + inode->i_mapping->a_ops = &eufs_aops; + + eufs_trace_newfile("!symlink!", dir, inode, pi, NULL); + + EUFS_I(inode)->i_is_dirty = true; + d_instantiate(dentry, inode); + unlock_new_inode(inode); + + sync_on_draining(dir, NULL); + + return 0; +} + +static int eufs_link(struct dentry *dest_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode 
*inode = dest_dentry->d_inode; + struct eufs_inode *pi = EUFS_PI(inode); + struct nv_dict_entry *de; + int err; + + /* name checks */ + if (unlikely(!dentry->d_name.len)) + return -EINVAL; + if (unlikely(dentry->d_name.len > EUFS_MAX_NAME_LEN)) + return -ENAMETOOLONG; + /* nlink check */ + if (unlikely(inode->i_nlink >= EUFS_LINK_MAX)) + return -EMLINK; + + ihold(inode); + + err = add_pinode(dir, dentry, inode, true); + if (unlikely(err)) { + iput(inode); + return err; + } + + /* update inode ctime */ + inode->i_ctime = current_time(inode); + inc_nlink(inode); + + EUFS_I(inode)->i_is_dirty = true; + d_instantiate(dentry, inode); + + eufs_trace_newfile("!link!", dir, inode, pi, de); + + /* inode_lock() has been acquired */ + sync_on_draining(dir, inode); + + return 0; +} + +static int eufs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int ret; + + ret = del_pinode(dir, dentry, false); + if (ret < 0) + return ret; + + eufs_trace_delfile("!unlink!", dir, inode, EUFS_PI(inode)); + + EUFS_I(inode)->i_is_dirty = true; + + sync_on_draining(dir, inode); + + return 0; +} + +/* NOTE: do not count the link for directories */ +static int eufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + struct eufs_inode *dir_pi = EUFS_PI(dir); + struct eufs_inode_info *vi; + struct eufs_inode *pi; + int err; + + /* name checks */ + if (unlikely(!dentry->d_name.len)) + return -EINVAL; + + if (unlikely(dentry->d_name.len > EUFS_MAX_NAME_LEN)) + return -ENAMETOOLONG; + + /* alloc vfs inode and xxfs inode */ + inode = pre_inodes_get(dentry, dir, S_IFDIR | mode, false, 0); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &eufs_dir_inode_operations; + inode->i_fop = &eufs_dir_operations; + inode->i_mapping->a_ops = &eufs_aops; + /* We have to mimic the nlink number */ + inc_nlink(inode); + + /* alloc & init dir hash table for new inode */ + pi = EUFS_FRESH_PI(EUFS_PI(inode)); + vi = EUFS_I(inode); + vi->i_dotdot = p2o(dir->i_sb, dir_pi); + pi->i_dotdot = cpu_to_le64(vi->i_dotdot); + + err = add_pinode(dir, dentry, inode, false); + if (err) { + eufs_free_new_inode(inode); + return err; + } + + /* We have to mimic the nlink number */ + inc_nlink(dir); + + eufs_trace_newfile("!mkdir!", dir, inode, pi, NULL); + + vi->i_is_dirty = true; + d_instantiate(dentry, inode); + unlock_new_inode(inode); + + sync_on_draining(dir, NULL); + + PRINT_PINODE(pi, "FINAL-CHECK: "); + + BUG_ON(atomic_read(&dir->i_count) < 1); + if (inode) + BUG_ON(atomic_read(&inode->i_count) < 1); + + return 0; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +static __always_inline int eufs_empty_dir(struct inode *inode) +{ + return !inode->i_size; +} + +static int eufs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int ret; + + /* checks before rmdir */ + if (!inode) + return -ENOENT; + if (!eufs_empty_dir(inode)) + return -ENOTEMPTY; + + ret = del_pinode(dir, dentry, true); + if (ret < 0) + return ret; + + /* We have to mimic the nlink number */ + drop_nlink(dir); + + EUFS_I(inode)->i_is_dirty = true; + + eufs_trace_delfile("!rmdir!", dir, inode, EUFS_PI(inode)); + + sync_on_draining(dir, inode); + + return 0; +} + +/* + * Precondition: old_dentry exists in the old directory + */ +static int eufs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + struct inode *old_inode = 
old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + /* + * TODO: need to lock old_inode + * If old_inode is a directory, its inode lock will + * not be acquired, so the offset of newest physical node + * may be changed during the rename procedure. + */ + struct eufs_inode *pi = EUFS_FRESH_PI(EUFS_PI(old_inode)); + struct eufs_inode *old_dir_pi; + struct eufs_inode *new_dir_pi; + struct inode *locked_inodes[EUFS_INODE_CNT_IN_RENAME] = { + old_dir, new_dir, old_inode, new_inode + }; + struct super_block *sb = old_inode->i_sb; + + struct nv_dict_entry *new_de; + u64 *new_dir_nv_header; + struct nv_dict_entry *old_de, *old_prevde; + u64 *old_dir_nv_header; + bool in_same_dir = (old_dir == new_dir); + + const char *name; + struct eufs_renamej *renamej; + u64 old_hv, new_hv; + struct nv_dict_entry **vde; + int cpu; + void *buffer[16]; + + NV_ASSERT(pi->i_mode == old_inode->i_mode); + + if (flags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) + return -EOPNOTSUPP; + + /* checks */ + if (new_inode) { + eufs_dbg( + "!new_inode=%px new_inode->i_count=%d new_dentry->d_lockref.count=%d\n", + new_inode, atomic_read(&new_inode->i_count), + new_dentry->d_lockref.count); + if (S_ISDIR(new_inode->i_mode) && !eufs_empty_dir(new_inode)) + return -ENOTEMPTY; + eufs_dbg("rename overwrites! newinode=%px newpi=%px newinode->i_mode=0%o, newinode->root=%px oldinode=%px oldpi=%px oldinode->i_mode=0%o, oldinode->root=%px\n", + new_inode, EUFS_PI(new_inode), new_inode->i_mode, + EUFS_I(new_inode)->i_volatile_root, old_inode, + EUFS_PI(old_inode), old_inode->i_mode, + EUFS_I(old_inode)->i_volatile_root); + BUG_ON(new_inode->i_mode != EUFS_PI(new_inode)->i_mode); + } + eufs_dbg("%s: rename %s to %s before fsync, old_pi=%px new_pi=%px\n", + __func__, old_dentry->d_name.name, new_dentry->d_name.name, + EUFS_PI(old_inode), + new_inode ? 
EUFS_PI(new_inode) : (void *)-1); + + eufs_dbg("old: dir=%px inode=%px; new: dir=%px inode=%px\n", old_dir, + old_inode, new_dir, new_inode); + + if (S_ISDIR(old_inode->i_mode)) + locked_inodes[2] = NULL; + if (locked_inodes[0] == locked_inodes[1]) + locked_inodes[1] = NULL; + + if (locked_inodes[0]) + BUG_ON(!inode_is_locked(locked_inodes[0])); + if (locked_inodes[1]) + BUG_ON(!inode_is_locked(locked_inodes[1])); + if (locked_inodes[2]) + BUG_ON(!inode_is_locked(locked_inodes[2])); + if (locked_inodes[3]) + BUG_ON(!inode_is_locked(locked_inodes[3])); + + fsync_rename_inodes(old_dir, new_dir, locked_inodes); + + /* + * get the newer inodes after fsync_rename_inodes() completes + * which may update the offset of the newer inodes + */ + old_dir_pi = EUFS_FRESH_PI(EUFS_PI(old_dir)); + new_dir_pi = EUFS_FRESH_PI(EUFS_PI(new_dir)); + + /* -------------- get new dentry info -------------- */ + /* get new filename */ + new_hv = hash(new_dentry->d_name.name, new_dentry->d_name.len); + name = new_dentry->d_name.name; + eufs_dbg("%s: rename %s to %s\n", __func__, old_dentry->d_name.name, + new_dentry->d_name.name); + + /* -------------- insertion ---------------- */ + /* insert into parent dir hash table */ + if (new_inode) { + new_de = nv_dict_find(new_dir, new_hv, name); + if (!new_de) + return -ENOENT; + /* Delay the actual write */ + BUG_ON(!new_inode->i_nlink); + ihold(new_inode); + /* We have new_inode in hand */ + if (S_ISDIR(new_inode->i_mode)) { + WARN_ON(new_inode->i_nlink != 2); + clear_nlink(new_inode); + } else { + drop_nlink(new_inode); + } + new_dir_nv_header = NULL; + } else { + new_de = nv_dict_add_wrapper(new_dir, &new_dir_nv_header, + EUFS_HEAD_PI(pi), new_hv, name); + if (IS_ERR(new_de)) + return PTR_ERR(new_de); + if (unlikely(!new_de)) + return -EEXIST; + /* We have no dep in rename. Just release the header lock */ + inode_header_unlock(new_dir); + + if (!in_same_dir) { + new_dir->i_size++; + if (S_ISDIR(old_inode->i_mode)) + inc_nlink(new_dir); + } + eufs_dbg("rename diradd +> %lld of %px 0x%lx\n", + new_dir->i_size, new_dir, new_dir->i_ino); + } + /* update dir time */ + new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir); + + /* -------------- get old dentry info -------------- */ + /* get old filename */ + old_hv = hash(old_dentry->d_name.name, old_dentry->d_name.len); + name = old_dentry->d_name.name; + if (!name) { + BUG(); + return -ENOENT; + } + + /* -------------- removal ---------------- */ + old_de = nv_dict_del_wrapper(old_dir, &old_prevde, &old_dir_nv_header, + old_hv, name); + if (unlikely(!old_de)) { + BUG(); + return -ENOENT; + } + /* We have no dep in rename. Just release the header lock */ + inode_header_unlock(old_dir); + + if (!in_same_dir || new_inode) { + old_dir->i_size--; + if (S_ISDIR(old_inode->i_mode)) + drop_nlink(old_dir); + } + if (old_dir != new_dir) + old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime; + + eufs_dbg("rename dirdel -> %lld of %px 0x%lx\n", old_dir->i_size, + old_dir, old_dir->i_ino); + + /* old_inode may NOT be locked ? 
*/ + /* update ctime of source inode */ + old_inode->i_ctime = new_dir->i_ctime; + if (!in_same_dir && S_ISDIR(old_inode->i_mode)) { + /* update parent pointer of source inode */ + struct eufs_inode_info *vi = EUFS_I(old_inode); + + vi->i_dotdot = p2o(sb, EUFS_HEAD_PI(new_dir_pi)); + } + + NV_ASSERT(new_de->inode == old_de->inode); + NV_ASSERT(eufs_valid_inode_in_de(old_de, old_inode)); + NV_ASSERT(old_inode->i_mode == + eufs_iread_mode(EUFS_FRESH_PI( + (struct eufs_inode *)(s2p(sb, old_de->inode))))); + + if (!new_inode) { + struct alloc_batch ab; + + ab.n_used = 0; + ab.size = 16; + ab.batch = buffer; + + eufs_alloc_batch_add(old_dir->i_sb, &ab, new_de); + persist_name(old_dir->i_sb, new_de, &ab); + eufs_dentry_clr_not_persist_flag(new_de); + persist_dentry(new_de); + + eufs_alloc_batch_persist_reset(old_dir->i_sb, &ab); + } + + cpu = get_cpu(); + /* RenameJ is redo log */ + renamej = eufs_get_renamej(old_dir->i_sb, cpu); + renamej->crc = 0; + renamej->flags = 0; + /* address to put old_de->next */ + renamej->addr_of_oldnext = + p2s(sb, (old_prevde ? &old_prevde->next : + (void *)old_dir_nv_header)); + /* the value: old_de->next */ + renamej->oldnext = + (old_prevde ? + old_de->next : + old_de->next == EUFS_DIR_EOC ? + NULL_VAL : + COMPOSE_DICT_HEAD_le64(sb, s2p(sb, old_de->next))); + + /* address to put new_de if necessary */ + renamej->addr_of_newde = p2s(sb, new_dir_nv_header); + /* the value: new_de */ + renamej->composed_newde = COMPOSE_DICT_HEAD_le64(sb, new_de); + /* the value: new_de->inode */ + renamej->newde_inode = p2s(sb, EUFS_HEAD_PI(pi)); + + /* dir pi */ + renamej->old_dir_pi = p2s(sb, EUFS_HEAD_PI(old_dir_pi)); + renamej->new_dir_pi = p2s(sb, EUFS_HEAD_PI(new_dir_pi)); + /* inode attributes */ + renamej->time = cpu_to_le64(new_dir->i_ctime.tv_sec); + renamej->time_nsec = cpu_to_le32(new_dir->i_ctime.tv_nsec); + renamej->old_link = cpu_to_le16(old_dir->i_nlink); + renamej->new_link = cpu_to_le16(new_dir->i_nlink); + renamej->old_size = cpu_to_le32(old_dir->i_size); + renamej->new_size = cpu_to_le32(new_dir->i_size); + memset(renamej->pad, 0, sizeof(renamej->pad)); + + renamej->flags = EUFS_RENAME_IN_ACTION; + renamej->crc = cpu_to_le32( + crc32c(EUFS_CRC_SEED, (char *)renamej + sizeof(renamej->crc), + sizeof(*renamej) - sizeof(renamej->crc))); + + eufs_flush_cacheline((char *)renamej + CACHELINE_SIZE); + eufs_flush_cacheline(renamej); + + if (old_prevde) { + old_prevde->next = old_de->next; + } else { + if (old_de->next == EUFS_DIR_EOC) + *old_dir_nv_header = NULL_VAL; + else + *old_dir_nv_header = COMPOSE_DICT_HEAD_le64( + sb, s2p(sb, old_de->next)); + } + eufs_flush_cacheline(old_prevde ? 
(void *)&old_prevde->next : + (void *)old_dir_nv_header); + + vde = &(EUFS_I(old_dir)->i_volatile_dict->table[INDEX(old_hv)]); + if (*vde) { + bool vbool = (*vde == NULL || *vde == (void *)EUFS_DIR_EOC); + bool pbool = (*old_dir_nv_header == NULL_VAL || + *old_dir_nv_header == EUFS_DIR_EOC); + BUG_ON(vbool != pbool); + *vde = NULL; + } + + if (new_inode) { + new_de->inode = p2s(sb, EUFS_HEAD_PI(pi)); + eufs_flush_cacheline(new_de); + } else { + *new_dir_nv_header = COMPOSE_DICT_HEAD_le64(sb, new_de); + eufs_flush_cacheline(new_dir_nv_header); + vde = &EUFS_I(new_dir)->i_volatile_dict->table[INDEX(new_hv)]; + if (*vde) { + bool vbool = + (*vde == NULL || *vde == (void *)EUFS_DIR_EOC); + bool pbool = (*new_dir_nv_header == NULL_VAL || + *new_dir_nv_header == EUFS_DIR_EOC); + BUG_ON(vbool != pbool); + *vde = NULL; + } + } + + eufs_iwrite_size(old_dir_pi, old_dir->i_size); + eufs_iwrite_nlink(old_dir_pi, old_dir->i_nlink); + eufs_iwrite_ctime_mtime(old_dir_pi, old_dir); + eufs_flush_pi(old_dir_pi); + + if (old_dir != new_dir) { + eufs_iwrite_size(new_dir_pi, new_dir->i_size); + eufs_iwrite_nlink(new_dir_pi, new_dir->i_nlink); + eufs_iwrite_ctime_mtime(new_dir_pi, new_dir); + eufs_flush_pi(new_dir_pi); + } + + eufs_iwrite_ctime(pi, old_inode->i_ctime.tv_sec); + eufs_iwrite_ctime_nsec(pi, old_inode->i_ctime.tv_nsec); + if (!in_same_dir && S_ISDIR(old_inode->i_mode)) { + struct eufs_inode_info *vi = EUFS_I(old_inode); + + eufs_iwrite_dotdot(pi, vi->i_dotdot); + } + eufs_flush_pi(pi); + + renamej->flags = 0; + eufs_flush_cacheline(renamej); + put_cpu(); + + /* remove overwritten inode */ + if (new_inode) + iput(new_inode); + + /* remove the source dentry */ + eufs_free_name(old_dir->i_sb, old_de); + nv_free(old_dir->i_sb, old_de); + + eufs_dbg("%s: renamed %s to %s , old_pi=%llx new_pi=%llx\n", __func__, + old_dentry->d_name.name, new_dentry->d_name.name, + old_de->inode, new_de->inode); + + return 0; +} + +const struct inode_operations eufs_dir_inode_operations = { + .create = eufs_create, + .lookup = eufs_lookup, + .link = eufs_link, + .unlink = eufs_unlink, + .symlink = eufs_symlink, + .mkdir = eufs_mkdir, + .rmdir = eufs_rmdir, + .mknod = eufs_mknod, + .rename = eufs_rename, + .setattr = eufs_notify_change, +}; + +const struct inode_operations eufs_special_inode_operations = { + .setattr = eufs_notify_change, +};
From: Yu Kuai <yukuai3@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR
CVE: NA
--------------------------------------
Implement file_operations for dir inode.
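As a usage note (not part of the diff): a plain userspace directory walk is
enough to exercise the readdir path added here; the mount point below is only
an example.

#include <dirent.h>
#include <stdio.h>

int main(void)
{
        DIR *d = opendir("/mnt/eulerfs");   /* example EulerFS mount point */
        struct dirent *ent;

        if (!d)
                return 1;
        /* ".", "..", and every name emitted by eufs_readdir() via dir_emit() */
        while ((ent = readdir(d)) != NULL)
                printf("%s\n", ent->d_name);
        closedir(d);
        return 0;
}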
Signed-off-by: Mingkai Dong <dongmingkai1@huawei.com>
Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Zhikang Zhang <zhangzhikang1@huawei.com>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/eulerfs/dir.c | 139 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 fs/eulerfs/dir.c
diff --git a/fs/eulerfs/dir.c b/fs/eulerfs/dir.c new file mode 100644 index 000000000000..24a47a3c187e --- /dev/null +++ b/fs/eulerfs/dir.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/fs.h> +#include <linux/pagemap.h> +#include "euler.h" +#include "dht.h" + +#define DT2IF(dt) (((dt) << 12) & S_IFMT) +#define IF2DT(sif) (((sif)&S_IFMT) >> 12) + +static int dir_emitter(void *privdata, const struct nv_dict_entry *de) +{ + struct dir_scan_data *data = (struct dir_scan_data *)privdata; + struct eufs_inode *pi; + int namelen; + const char *name; + char *page; + int r; + + pi = s2p(data->sb, de->inode); + pi = EUFS_FRESH_PI(pi); + name = de->name; + namelen = HASHLEN_LEN(de->hv); + + eufs_dbg("!de=%px, de->pi=%px, de->nextname=%llx, namelen=%d\n", de, + pi, de->nextname, namelen); + if (likely(namelen <= FIRST_LEN)) { + eufs_dbg("%s found name: %*s len: %d inode: %px\n", + __func__, namelen, name, namelen, pi); + + r = dir_emit(data->ctx, name, namelen, le64_to_cpu(de->inode), + IF2DT(eufs_iread_mode(pi))); + if (!r) + return -EINVAL; + return 0; + } + if (eufs_ptr_fast_check_b(de->nextname)) { + eufs_info("!de=%px, de->pi=%px, de->nextname=%llx, namelen=%d\n", + de, pi, de->nextname, namelen); + BUG(); + } + page = eufs_alloc_name_copy(data->sb, name, namelen, + s2p(data->sb, de->nextname)); + eufs_dbg("%s found name: %*s len: %d inode: %px\n", __func__, namelen, + page, namelen, pi); + + r = dir_emit(data->ctx, page, namelen, le64_to_cpu(de->inode), + IF2DT(eufs_iread_mode(pi))); + eufs_free_page(page); + if (!r) + return -EINVAL; + return 0; +} + +static int eufs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct eufs_inode *pi = EUFS_PI(inode); + struct dir_scan_data data = { .sb = inode->i_sb, .ctx = ctx }; + + if (ctx->pos == EUFS_DIR_EODIR) + return 0; + if (ctx->pos == 0) { + if (!dir_emit(ctx, ".", 1, (u64)eufs_pi2ino(inode->i_sb, pi), + IF2DT(inode->i_mode))) { + return -EINVAL; + } + ctx->pos = EUFS_DIR_DOT; + } + + if (ctx->pos == EUFS_DIR_DOT) { + struct eufs_inode *dotdot = o2p( + inode->i_sb, eufs_iread_dotdot(EUFS_FRESH_PI(pi))); + + if (!dir_emit(ctx, "..", 2, + (u64)eufs_pi2ino(inode->i_sb, dotdot), + IF2DT(eufs_iread_mode(dotdot)))) + return -EINVAL; + ctx->pos = EUFS_DIR_DOTDOT; + } + + if (!inode->i_size) { + ctx->pos = EUFS_DIR_EODIR; + return 0; + } + eufs_dbg("In Readdir! ctx->pos=%llx inode=%px, pi=%px\n", ctx->pos, + inode, pi); + + nv_dict_scan_via_ptr(inode, ctx->pos, dir_emitter, (void *)&data); + + eufs_dbg("Out Readdir! 
ctx->pos=%llx\n", ctx->pos); + return 0; +} + +static loff_t eufs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + loff_t retval; + + inode_lock(inode); + switch (whence) { + case SEEK_END: + /* TODO */ + retval = -EINVAL; + goto out; + case SEEK_CUR: + /* TODO */ + retval = -EINVAL; + goto out; + case SEEK_SET: + break; + } + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + retval = offset; +out: + inode_unlock(inode); + return retval; +} + +const struct file_operations eufs_dir_operations = { + .llseek = eufs_dir_llseek, + .read = generic_read_dir, + .iterate = eufs_readdir, + .fsync = eufs_fsync, +};
From: Yu Kuai <yukuai3@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR
CVE: NA
--------------------------------------
Implement the get_link and setattr interfaces for symlink inode operations.
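For reference, eufs_get_link() can hand the VFS a pointer straight into
persistent memory because eufs_symlink() stores the target right behind a
64-bit name hash in the inode's root page. A rough sketch of that layout,
with a made-up struct name (the real code just uses raw offsets):

/* illustrative layout of the symlink data page written by eufs_symlink() */
struct symlink_page_layout {
        u64  name_hash;   /* hash(symname, len) */
        char target[];    /* the link target, inode->i_size bytes */
};
/* eufs_get_link() returns (char *)page + sizeof(u64), i.e. &target[0] */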
Signed-off-by: Mingkai Dong <dongmingkai1@huawei.com>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/eulerfs/symlink.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 fs/eulerfs/symlink.c
diff --git a/fs/eulerfs/symlink.c b/fs/eulerfs/symlink.c new file mode 100644 index 000000000000..831c5c8758ea --- /dev/null +++ b/fs/eulerfs/symlink.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include "euler.h" + +static const char *eufs_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *call) +{ + struct eufs_inode *pi = EUFS_FRESH_PI(EUFS_PI(inode)); + + return ((char *)o2p(inode->i_sb, eufs_iread_root(pi))) + sizeof(u64); +} + +const struct inode_operations eufs_symlink_inode_operations = { + .get_link = eufs_get_link, + .setattr = eufs_notify_change, +};
From: Yu Kuai <yukuai3@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR
CVE: NA
--------------------------------------
Implement super_operations and module_init/exit interfaces.
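As a usage sketch (not part of the diff): mounting goes through eufs_mount()
and eufs_fill_super(); the "init" option formats a fresh instance in
eufs_parse_options(), while "dax" is accepted only for compatibility. The
device and mount point below are examples.

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* "eulerfs" matches eufs_fs_type.name; "init" formats the device */
        if (mount("/dev/pmem0", "/mnt/eulerfs", "eulerfs", 0, "init")) {
                perror("mount");
                return 1;
        }
        return 0;
}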
Signed-off-by: Mingkai Dong <dongmingkai1@huawei.com>
Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/eulerfs/super.c | 811 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 811 insertions(+)
 create mode 100644 fs/eulerfs/super.c
diff --git a/fs/eulerfs/super.c b/fs/eulerfs/super.c new file mode 100644 index 000000000000..43fc717002d7 --- /dev/null +++ b/fs/eulerfs/super.c @@ -0,0 +1,811 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/parser.h> +#include <linux/vfs.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <linux/seq_file.h> +#include <linux/mount.h> +#include <linux/mm.h> +#include <linux/ctype.h> +#include <linux/bitops.h> +#include <linux/magic.h> +#include <linux/exportfs.h> +#include <linux/random.h> +#include <linux/cred.h> +#include <linux/backing-dev.h> +#include <linux/list.h> +#include <linux/pfn_t.h> +#include <linux/dax.h> +#include <linux/genhd.h> +#include <linux/cdev.h> +#include <uapi/linux/mount.h> +#include "euler.h" +#include "dht.h" +#include "dep.h" +#include "nvalloc.h" +#include "wear.h" + +int support_clwb; +int support_clflushopt; +int support_clflush; +int force_nocache_write; +int persist_period = -4; +int persisters_per_socket = 1; +int max_dirty_inodes = 1000000; +int max_dep_nodes = 1000000; +int wear_control; +int wear_threshold = 100000; +int wear_alloc_threshold = 10000; + +module_param(persisters_per_socket, int, 0444); +MODULE_PARM_DESC(persisters_per_socket, "Num of Persisters per socket"); +module_param(force_nocache_write, int, 0444); +MODULE_PARM_DESC(force_nocache_write, "Force to use nocache data write"); +module_param(persist_period, int, 0444); +MODULE_PARM_DESC(persist_period, "Period to wake persisters up"); +module_param(max_dirty_inodes, int, 0444); +MODULE_PARM_DESC(max_dirty_inodes, + "Limit the max number of dirty inodes allowed"); +module_param(max_dep_nodes, int, 0444); +MODULE_PARM_DESC(max_dep_nodes, "Limit the max number of dep nodes allowed"); +module_param(wear_control, int, 0444); +MODULE_PARM_DESC(wear_control, "Control wear leveling"); +module_param(wear_threshold, int, 0444); +MODULE_PARM_DESC(wear_threshold, "Wear leveling threshold"); +module_param(wear_alloc_threshold, int, 0444); +MODULE_PARM_DESC(wear_alloc_threshold, + "Wear leveling threshold for allocation"); + +int num_sockets; + +static struct super_operations eufs_sops; + +void eufs_error_mng(struct super_block *sb, const char *fmt, ...) 
+{ + va_list args; + + eufs_info("euler error: "); + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + pr_crit("euler err: remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; +} + +static void eufs_show_params(void) +{ + eufs_info("params: force_nocache_write=%d\n", force_nocache_write); + eufs_info("params: persist_period=%d\n", persist_period); + eufs_info("params: persisters_per_socket=%d\n", persisters_per_socket); +} + +static void eufs_detect_features(void) +{ + support_clwb = support_clflushopt = support_clflush = 0; + if (arch_has_clwb()) { + eufs_info("arch has CLWB support\n"); + support_clwb = 1; + } + + if (arch_has_clflushopt()) { + eufs_info("arch has CLFLUSHOPT support\n"); + support_clflushopt = 1; + } + + if (arch_has_clflush()) { + eufs_info("arch has CLFLUSH support\n"); + support_clflush = 1; + } + + if (!support_clwb && !support_clflushopt && !support_clflush) + eufs_info("arch has no cache flush support\n"); +} + +static int eufs_get_block_info(struct super_block *sb, struct eufs_sb_info *sbi) +{ + void *virt_addr = NULL; + pfn_t pfn; + long size; + struct dax_device *dax_dev; + int srcu_id; + + if (!bdev_dax_supported(sb->s_bdev, PAGE_SIZE)) { + eufs_err(sb, "device does not support DAX\n"); + return -EINVAL; + } + + dax_dev = dax_get_by_host(sb->s_bdev->bd_disk->disk_name); + if (!dax_dev) { + eufs_err(sb, "device does not support DAX\n"); + return -EINVAL; + } + + srcu_id = dax_read_lock(); + size = dax_direct_access( + dax_dev, 0, i_size_read(sb->s_bdev->bd_inode) >> PAGE_SHIFT, + &virt_addr, &pfn); + dax_read_unlock(srcu_id); + if (size < 0) { + fs_put_dax(dax_dev); + eufs_err(sb, "device DAX error %ld\n", size); + return size; + } + + sbi->s_dax_dev = dax_dev; + sbi->s_bdev = sb->s_bdev; + sbi->virt_addr = virt_addr; + sbi->phys_addr = pfn_t_to_pfn(pfn) << PAGE_SHIFT; + sbi->initsize = (u64)size << PAGE_SHIFT; + + eufs_info("dev %s virt_addr %px phys_addr %llx size %ld\n", + sb->s_bdev->bd_disk->disk_name, sbi->virt_addr, + sbi->phys_addr, sbi->initsize); + + return 0; +} + +enum { + Opt_init, + Opt_dax, + Opt_err +}; + +static const match_table_t tokens = { + { Opt_init, "init" }, + { Opt_dax, "dax" }, /* DAX is always on. This is for compatibility. 
*/ + { Opt_err, NULL }, +}; + +static int eufs_parse_options(char *options, struct eufs_sb_info *sbi, + bool remount) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_init: + if (remount) + goto bad_opt; + set_opt(sbi->s_mount_opt, FORMAT); + break; + case Opt_dax: + break; + default: + goto bad_opt; + } + } + + return 0; + +bad_opt: + eufs_info("Bad mount option: "%s"\n", p); + return -EINVAL; +} + +static bool eufs_check_size(struct super_block *sb, unsigned long size) +{ + unsigned long minimum_size; + + /* For Super Block */ + minimum_size = 2 << sb->s_blocksize_bits; + /* For Bitmaps */ + minimum_size += size / EUFS_BLOCK_SIZE / 8; + + if (size < minimum_size) + return false; + + return true; +} + +static __always_inline int eufs_check_super(struct eufs_super_block *ps, + const char *typ) +{ + u16 save_crc = 0; + u16 calc_crc = 0; + struct eufs_super_block scratch; + + memcpy(&scratch, ps, sizeof(*ps)); + save_crc = scratch.s_sum; + scratch.s_sum = 0; + scratch.s_safe_umount = 0; + calc_crc = crc16(~0, (__u8 *)&scratch, sizeof(scratch)); + if (save_crc != calc_crc) { + eufs_warn("Recognizing %s super block failed: crc %x mismatch (%x expected)", + typ, calc_crc, save_crc); + return -EIO; + } + if (scratch.s_magic != EUFS_SUPER_MAGIC) { + eufs_warn("Recognizing %s super block failed: magic %x mismatch (%x expected)", + typ, scratch.s_magic, EUFS_SUPER_MAGIC); + return -EIO; + } + return 0; +} + +static __always_inline int eufs_recognize_fs(struct super_block *sb) +{ + struct eufs_super_block *super; + struct eufs_super_block *super2; + int err; + + super = eufs_get_super(sb); + super2 = (void *)super + EUFS_SB2_OFFSET; + err = eufs_check_super(super, "primary"); + if (err) { + err = eufs_check_super(super2, "secondary"); + if (err) + return -EIO; + + eufs_info("Secondary super block recognized, syncing back to the primary.\n"); + memcpy(super, super2, sizeof(struct eufs_super_block)); + eufs_flush_buffer(super2, sizeof(*super2), false); + eufs_pbarrier(); + } + return 0; +} + +static __always_inline void eufs_sync_super(struct eufs_super_block *ps) +{ + u16 crc = 0; + __le32 saved_safe_umount = ps->s_safe_umount; + + ps->s_safe_umount = 0; + ps->s_wtime = cpu_to_le32(get_seconds()); + ps->s_sum = 0; + crc = crc16(~0, (__u8 *)ps, sizeof(struct eufs_super_block)); + ps->s_sum = cpu_to_le16(crc); + + eufs_flush_buffer(ps, sizeof(*ps), false); + eufs_pbarrier(); + + /* Keep sync redundant super block */ + memcpy((void *)ps + EUFS_SB2_OFFSET, (void *)ps, + sizeof(struct eufs_super_block)); + eufs_flush_buffer((void *)ps + EUFS_SB2_OFFSET, sizeof(*ps), false); + eufs_pbarrier(); + ps->s_safe_umount = saved_safe_umount; +} + +static struct eufs_inode *eufs_init(struct super_block *sb, unsigned long size) +{ + struct eufs_inode __pmem *root_i; + struct eufs_super_block __pmem *super; + struct eufs_sb_info *sbi = EUFS_SB(sb); + struct nv_dict *dict; + + eufs_info("creating an empty eulerfs of size %lu\n", size); + + sbi->block_start = 0; + sbi->block_end = ((unsigned long)(size) >> PAGE_SHIFT); + + if (!sbi->virt_addr) { + eufs_err(sb, "mapping eulerfs image failed\n"); + return ERR_PTR(-EINVAL); + } + + sb->s_blocksize_bits = EUFS_BLOCK_SIZE_BITS; + sbi->blocksize = EUFS_BLOCK_SIZE; + + if (!eufs_check_size(sb, size)) { + eufs_err(sb, "Specified size too small 0x%lx for EulerFS\n", + size); + return 
ERR_PTR(-EINVAL); + } + + super = eufs_get_super(sb); + + super->s_sum = 0; + super->s_magic = cpu_to_le16(EUFS_SUPER_MAGIC); + super->s_safe_umount = 0; + super->s_flag = 0; + super->s_fs_version = cpu_to_le16(1); + super->s_size = cpu_to_le64(size); + super->s_virt_addr = cpu_to_le64(sbi->virt_addr); + + sbi->s_crash_ver = 1; + super->s_crash_ver = cpu_to_le64(1); + + nv_init(sb, true); + super->s_page_map = cpu_to_le64(p2o(sb, sbi->page_map)); + super->s_mtime = 0; + + root_i = eufs_malloc_pinode(sb); + if (!root_i) + return ERR_PTR(-ENOSPC); + + eufs_info("root_i: %px\n", root_i); + eufs_alloc_persist(sb, root_i, false); + + super->s_root_pi = p2s(sb, root_i); + eufs_sync_super(super); + + /* ================ init root dir =============== */ + eufs_iwrite_flags(root_i, 0); + eufs_iwrite_mode(root_i, S_IRUGO | S_IXUGO | S_IWUSR | S_IFDIR); + eufs_iwrite_version(root_i, 1); + eufs_iwrite_ctime(root_i, get_seconds()); + eufs_iwrite_ctime_nsec(root_i, 0); + eufs_iwrite_uid(root_i, from_kuid(&init_user_ns, current_fsuid())); + eufs_iwrite_gid(root_i, from_kgid(&init_user_ns, current_fsgid())); + eufs_iwrite_dotdot(root_i, p2o(sb, root_i)); + eufs_iwrite_ext(root_i, 0); /* no ext here */ + eufs_iwrite_generation(root_i, 0); + eufs_iwrite_nlink(root_i, 2); + eufs_iwrite_mtime(root_i, get_seconds()); + eufs_iwrite_atime(root_i, get_seconds()); + eufs_iwrite_mtime_nsec(root_i, 0); + eufs_iwrite_atime_nsec(root_i, 0); + dict = eufs_zalloc_htable(sb); + if (!dict) + return ERR_PTR(-ENOSPC); + eufs_alloc_persist(sb, dict, false); + eufs_flush_range(dict, sizeof(struct nv_dict)); + + eufs_iwrite_dict(root_i, p2o(sb, dict)); + eufs_iwrite_size(root_i, 0); + + root_i->i_fresh = 2; + eufs_flush_cacheline(root_i); + eufs_flush_cacheline(&root_i->i_fresh); + EUFS_TWIN_PI(root_i)->i_fresh = 1; + eufs_flush_cacheline(&EUFS_TWIN_PI(root_i)->i_fresh); + + eufs_pbarrier(); + return root_i; +} + +static void eufs_destroy_super(struct super_block *sb) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + + wear_fini(sb); + + dep_fini(sb); + + nv_fini(sb); + + if (sbi->virt_addr) + sbi->virt_addr = NULL; + if (sbi->s_dax_dev) + fs_put_dax(sbi->s_dax_dev); + + sb->s_fs_info = NULL; + + kfree(sbi); +} + +static int eufs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct eufs_super_block __pmem *super; + struct eufs_inode __pmem *root_pi; + struct eufs_sb_info *sbi = NULL; + struct inode *root_i = NULL; + u32 random = 0; + int err; + + BUILD_BUG_ON(sizeof(struct eufs_super_block) > EUFS_SB_SIZE); + BUILD_BUG_ON(sizeof(struct eufs_inode) != 2 * CACHELINE_SIZE); + BUILD_BUG_ON(sizeof(struct nv_dict_entry) != CACHELINE_SIZE); + + eufs_detect_features(); + + sbi = kzalloc(sizeof(struct eufs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sbi->s_draining = false; + init_waitqueue_head(&sbi->s_draining_wq); + atomic_set(&sbi->s_nr_dirty_inodes, 0); + atomic_set(&sbi->s_nr_dep_nodes, 0); + + sb->s_fs_info = sbi; + + err = eufs_get_block_info(sb, sbi); + if (err) + goto out; + + get_random_bytes(&random, sizeof(u32)); + atomic_set(&sbi->next_generation, random); + + mutex_init(&sbi->s_lock); + mutex_init(&sbi->gather_mutex); + mutex_init(&sbi->sync_mutex); + + err = eufs_parse_options(data, sbi, 0); + if (err) + goto out; + + super = eufs_get_super(sb); + + /* Init a new EulerFS instance */ + if (test_opt(sb, FORMAT)) { + root_pi = eufs_init(sb, sbi->initsize); + if (IS_ERR(root_pi)) { + err = PTR_ERR(root_pi); + goto out; + } + + goto setup_sb; + } + + err = eufs_recognize_fs(sb); + if (err) { + 
eufs_crit("No valid EulerFS found. Are you trying to mount a wrong fs?\n"); + goto out; + } + + sbi->block_start = 0; + sbi->block_end = ((unsigned long)(super->s_size) >> PAGE_SHIFT); + sb->s_blocksize_bits = EUFS_BLOCK_SIZE_BITS; + sbi->blocksize = EUFS_BLOCK_SIZE; + + sbi->page_map = (void *)o2p(sb, super->s_page_map); + sbi->initsize = (u64)super->s_size; + eufs_get_layout(sb, false); + + sbi->s_crash_ver = le64_to_cpu(super->s_crash_ver); + + if (!super->s_safe_umount) { + super->s_crash_ver = cpu_to_le64(++sbi->s_crash_ver); + eufs_flush_cacheline(&super->s_crash_ver); + eufs_pbarrier(); + } + + nv_init(sb, false); + + root_pi = (struct eufs_inode *)s2p(sb, super->s_root_pi); + +setup_sb: + super->s_safe_umount = 0; + eufs_flush_cacheline(&super->s_safe_umount); + eufs_pbarrier(); + + sbi->s_crash_ver = le64_to_cpu(super->s_crash_ver); + + sb->s_magic = le16_to_cpu(super->s_magic); + sb->s_op = &eufs_sops; + sb->s_maxbytes = EUFS_MAX_FILE_SIZE; + sb->s_time_gran = NSEC_PER_SEC; + + err = dep_init(sb); + if (err) + goto out; + + wear_init(sb); + + root_i = eufs_iget(sb, root_pi); + if (IS_ERR(root_i)) { + err = PTR_ERR(root_i); + goto out; + } + + sb->s_root = d_make_root(root_i); + if (!sb->s_root) { + eufs_err(sb, "alloc root dentry failed\n"); + err = -ENOMEM; + goto out; + } + + if (!(sb->s_flags & MS_RDONLY)) { + u64 mnt_write_time; + /* update mount time and write time atomically. */ + mnt_write_time = (get_seconds() & 0xFFFFFFFF); + mnt_write_time = mnt_write_time | (mnt_write_time << 32); + + super->s_mtime = mnt_write_time; + + eufs_flush_buffer(&super->s_mtime, 8, false); + eufs_pbarrier(); + } + + return 0; + +out: + eufs_destroy_super(sb); + return err; +} + +static int eufs_statfs(struct dentry *d, struct kstatfs *buf) +{ + struct super_block *sb = d->d_sb; + struct eufs_sb_info *sbi = (struct eufs_sb_info *)sb->s_fs_info; + + u64 npage, ncl; + + nv_stat(sbi, &npage, &ncl); + + buf->f_type = EUFS_SUPER_MAGIC; + buf->f_bsize = PAGE_SIZE; + + buf->f_blocks = sbi->block_end; + + buf->f_bfree = npage; + buf->f_bavail = npage; + + buf->f_files = ncl; + buf->f_ffree = ncl; + + buf->f_namelen = EUFS_MAX_NAME_LEN; + print_stats(sbi); + + return 0; +} + +static int eufs_show_options(struct seq_file *seq, struct dentry *root) +{ + seq_puts(seq, ",dax"); + + return 0; +} + +static int eufs_remount(struct super_block *sb, int *mntflags, char *data) +{ + unsigned long old_sb_flags; + unsigned long old_mount_opt; + struct eufs_super_block *ps; + struct eufs_sb_info *sbi = EUFS_SB(sb); + int ret = -EINVAL; + + /* Store the old options */ + mutex_lock(&sbi->s_lock); + old_sb_flags = sb->s_flags; + old_mount_opt = sbi->s_mount_opt; + + if (eufs_parse_options(data, sbi, 1)) + goto restore_opt; + + if ((*mntflags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + u64 mnt_write_time; + + ps = eufs_get_super(sb); + /* update mount time and write time atomically. 
*/ + mnt_write_time = (get_seconds() & 0xFFFFFFFF); + mnt_write_time = mnt_write_time | (mnt_write_time << 32); + + ps->s_mtime = mnt_write_time; + + eufs_flush_buffer(&ps->s_mtime, 8, false); + eufs_pbarrier(); + } + + mutex_unlock(&sbi->s_lock); + ret = 0; + return ret; + +restore_opt: + sb->s_flags = old_sb_flags; + sbi->s_mount_opt = old_mount_opt; + mutex_unlock(&sbi->s_lock); + return ret; +} + +static void eufs_put_super(struct super_block *sb) +{ + struct eufs_super_block *super; + + super = eufs_get_super(sb); + + eufs_sync_super(super); + + super->s_safe_umount = 1; + eufs_flush_cacheline(&super->s_safe_umount); + eufs_pbarrier(); + + eufs_info("safe unmount.\n"); + eufs_destroy_super(sb); +} + +static struct inode *eufs_alloc_inode(struct super_block *sb) +{ + struct eufs_inode_info *vi; + + vi = eufs_alloc_vi(); + if (!vi) + return NULL; + + INIT_LIST_HEAD(&vi->i_dep_list); + + vi->i_next_dep_seq = 1; + vi->i_persisted_dep_seq = 0; + + spin_lock_init(&vi->i_owner_lock); + INIT_LIST_HEAD(&vi->i_owner_list); + + vi->i_lock_transferred = I_TRANS_NONE; + vi->i_is_persisting = false; + vi->i_is_dirty = false; + + vi->i_volatile_root = NULL; + vi->i_volatile_height = 0; + + vi->i_dotdot = 0; + + atomic64_set(&vi->vfs_inode.i_version, 1); + + vi->page_batch.size = 0; + vi->page_batch.n_used = -1; + vi->page_batch.batch = NULL; + INIT_LIST_HEAD(&vi->page_batch.list); + + vi->i_volatile_dict = NULL; + + mutex_init(&vi->i_urgent_mutex); + mutex_init(&vi->i_dep_lock); + mutex_init(&vi->i_header_lock); + + init_rwsem(&vi->mmap_rwsem); + spin_lock_init(&vi->i_dentry_persist_lock); + mutex_init(&vi->i_leaf_lock); + + return &vi->vfs_inode; +} + +static void eufs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + eufs_alloc_batch_fini(&EUFS_I(inode)->page_batch); + eufs_free_vi(EUFS_I(inode)); +} + +static void eufs_destroy_inode(struct inode *inode) +{ + if (EUFS_I(inode)->i_volatile_dict) { + eufs_free_page(EUFS_I(inode)->i_volatile_dict); + EUFS_I(inode)->i_volatile_dict = NULL; + } + call_rcu(&inode->i_rcu, eufs_i_callback); +} + +static int eufs_sync_fs(struct super_block *sb, int sync) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + int i; + int num_persisters = num_sockets * persisters_per_socket; + int wait_flag; + + if (!sync) + return 0; + + mutex_lock(&sbi->sync_mutex); + + for (i = 0; i < num_persisters; i++) + sbi->need_sync[i] = true; + + /* FIXME: Persisters may miss the wake-up message. */ + for (i = 0; i < num_persisters; ++i) + wake_up_process(sbi->persisters[i]); + + do { + wait_flag = false; + for (i = 0; i < num_persisters; i++) { + if (sbi->need_sync[i] == false) + continue; + wait_flag = true; + wait_event_interruptible(sbi->sync_wq, + (sbi->need_sync[i] == false)); + } + } while (wait_flag); + + mutex_unlock(&sbi->sync_mutex); + + return 0; +} + +/* + * the super block writes are all done "on the fly", so the + * super block is never in a "dirty" state, so there's no need + * for write_super. 
+ */ +static struct super_operations eufs_sops = { + .alloc_inode = eufs_alloc_inode, + .destroy_inode = eufs_destroy_inode, + .write_inode = eufs_write_inode, + .evict_inode = eufs_evict_inode, + .put_super = eufs_put_super, + .statfs = eufs_statfs, + .remount_fs = eufs_remount, + .show_options = eufs_show_options, + .sync_fs = eufs_sync_fs, +}; + +static struct dentry *eufs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, eufs_fill_super); +} + +static struct file_system_type eufs_fs_type = { + .owner = THIS_MODULE, + .name = "eulerfs", + .mount = eufs_mount, + .kill_sb = kill_block_super, +}; + +static int __init init_eufs_fs(void) +{ + int rc = 0; + int cpu; + + BUILD_BUG_ON(sizeof(struct eufs_renamej) != 2 * CACHELINE_SIZE); + + rc = init_page_cache(); + if (rc) + goto out1; + + rc = init_inodecache(); + if (rc) + goto out2; + + rc = init_dep_node_cache(); + if (rc) + goto out3; + + rc = register_filesystem(&eufs_fs_type); + if (rc) + goto out4; + + num_sockets = 0; + for_each_possible_cpu(cpu) { + int sock = cpu_to_node(cpu); + + if (sock > num_sockets) + num_sockets = sock; + } + num_sockets += 1; + eufs_info("Num socket: %d\n", num_sockets); + + eufs_show_params(); + + return 0; + +out4: + destroy_dep_node_cache(); +out3: + destroy_inodecache(); +out2: + destroy_page_cache(); +out1: + return rc; +} + +static void __exit exit_eufs_fs(void) +{ + unregister_filesystem(&eufs_fs_type); + destroy_inodecache(); + destroy_dep_node_cache(); + destroy_page_cache(); +} + +module_init(init_eufs_fs); +module_exit(exit_eufs_fs); + +MODULE_DESCRIPTION("EulerFS"); +MODULE_LICENSE("GPL");
From: Yu Kuai <yukuai3@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR
CVE: NA
--------------------------------------
Add Kconfig and Makefile for EulerFS and hook them into fs/.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/Kconfig          |  1 +
 fs/Makefile         |  1 +
 fs/eulerfs/Kconfig  | 10 ++++++++++
 fs/eulerfs/Makefile |  9 +++++++++
 4 files changed, 21 insertions(+)
 create mode 100644 fs/eulerfs/Kconfig
 create mode 100644 fs/eulerfs/Makefile
diff --git a/fs/Kconfig b/fs/Kconfig index 83c56571d0bc..3cc647e00f3c 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -354,6 +354,7 @@ source "fs/cifs/Kconfig" source "fs/coda/Kconfig" source "fs/afs/Kconfig" source "fs/9p/Kconfig" +source "fs/eulerfs/Kconfig"
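As a build note (not part of the diff): per this Kconfig, EULER_FS depends on
ARM64 || X86_64 and FS_DAX, and CRC32/LIBCRC32C are selected automatically. A
minimal .config fragment, with the module build chosen arbitrarily:

CONFIG_FS_DAX=y
CONFIG_EULER_FS=m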
endif # NETWORK_FILESYSTEMS
diff --git a/fs/Makefile b/fs/Makefile index 01d36815c3a5..fec76c1b4e06 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -139,3 +139,4 @@ obj-$(CONFIG_RESCTRL) += resctrlfs.o obj-$(CONFIG_EROFS_FS) += erofs/ obj-$(CONFIG_VBOXSF_FS) += vboxsf/ obj-$(CONFIG_ZONEFS_FS) += zonefs/ +obj-$(CONFIG_EULER_FS) += eulerfs/ diff --git a/fs/eulerfs/Kconfig b/fs/eulerfs/Kconfig new file mode 100644 index 000000000000..6328684f0b6a --- /dev/null +++ b/fs/eulerfs/Kconfig @@ -0,0 +1,10 @@ +config EULER_FS + tristate "EulerFS: filesystem for non-volatile memories" + depends on ARM64 || X86_64 + depends on FS_DAX + select CRC32 + select LIBCRC32C + help + EulerFS is NVDIMM filesystem. It uses soft updates and + pointer-based dual views to delay synchronous cache flushes + and reduce latency significantly in critical path. diff --git a/fs/eulerfs/Makefile b/fs/eulerfs/Makefile new file mode 100644 index 000000000000..706e6ebff77e --- /dev/null +++ b/fs/eulerfs/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for eulerfs +# + +obj-$(CONFIG_EULER_FS) += eulerfs.o + +eulerfs-y := dir.o file.o inode.o namei.o super.o symlink.o +eulerfs-y += dax.o dht.o dep.o nvalloc.o wear.o +eulerfs-y += kmem_cache.o