From: Hongbo Li <lihongbo22@huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/release-management/issues/ID5W1P

--------------------------------

When a read request misses in the cache (memory or disk), MFS posts a
miss event to userspace. The two cache types need different miss
checks: for memory we only check whether the target range is present
in the page cache, while for disk (remote mode) we probe the target
range with SEEK_DATA and SEEK_HOLE.

This patch introduces the basic structure for events, @mfs_syncer. In
remote mode the events are synchronous, so all I/O segments from the
same read range wait on the same syncer object. It also introduces
@mfs_cache_object to represent the cache object in the cache layer:
@mfs_cache_object is the controller for MFS's data and builds the
connection between MFS's data and the events.

Signed-off-by: Huang Xiaojia <huangxiaojia2@huawei.com>
Signed-off-by: Hongbo Li <lihongbo22@huawei.com>
---
 fs/mfs/Makefile          |   2 +-
 fs/mfs/cache.c           |  89 +++++++++++++++
 fs/mfs/data.c            | 229 ++++++++++++++++++++++++++++++++++++++-
 fs/mfs/inode.c           |   8 ++
 fs/mfs/internal.h        |  36 ++++++
 fs/mfs/super.c           |   9 ++
 include/uapi/linux/mfs.h |   6 +
 7 files changed, 377 insertions(+), 2 deletions(-)
 create mode 100644 fs/mfs/cache.c

diff --git a/fs/mfs/Makefile b/fs/mfs/Makefile
index 546e620daf8d..a3fe71ba61e8 100644
--- a/fs/mfs/Makefile
+++ b/fs/mfs/Makefile
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
 obj-$(CONFIG_MFS_FS) += mfs.o
-mfs-objs := super.o inode.o data.o
+mfs-objs := super.o inode.o data.o cache.o
diff --git a/fs/mfs/cache.c b/fs/mfs/cache.c
new file mode 100644
index 000000000000..30d5c0f986ee
--- /dev/null
+++ b/fs/mfs/cache.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright (C) 2025. Huawei Technologies Co., Ltd */
+
+#include "internal.h"
+
+/*
+ * Used for cache object
+ */
+static struct kmem_cache *mfs_cobject_cachep;
+
+static int mfs_setup_object(struct mfs_cache_object *object,
+			    struct inode *inode,
+			    struct path *cache_path)
+{
+	struct inode *cache_inode = d_inode(cache_path->dentry);
+	struct file *cache_file;
+	int flags = O_RDONLY;
+
+	if (need_sync_event(inode->i_sb))
+		flags = O_RDWR;
+	cache_file = kernel_file_open(cache_path, flags | O_LARGEFILE,
+				      cache_inode, current_cred());
+	if (IS_ERR(cache_file))
+		return PTR_ERR(cache_file);
+	/*
+	 * object belongs to a mfs inode,
+	 * this is a reverse pointer, no refcount needed.
+	 */
+	object->mfs_inode = inode;
+	object->cache_file = cache_file;
+	init_rwsem(&object->rwsem);
+	object->fd = -1;
+	object->anon_file = NULL;
+	return 0;
+}
+
+void mfs_post_event_read(struct mfs_cache_object *object,
+			 loff_t off, uint64_t len,
+			 struct mfs_syncer *syncer, int op)
+{
+}
+
+void mfs_cancel_syncer_events(struct mfs_cache_object *object,
+			      struct mfs_syncer *syncer)
+{
+}
+
+struct mfs_cache_object *mfs_alloc_object(struct inode *inode,
+					  struct path *cache_path)
+{
+	struct mfs_cache_object *object;
+	int err;
+
+	object = kmem_cache_alloc(mfs_cobject_cachep, GFP_KERNEL);
+	if (!object)
+		return ERR_PTR(-ENOMEM);
+
+	err = mfs_setup_object(object, inode, cache_path);
+	if (err) {
+		kmem_cache_free(mfs_cobject_cachep, object);
+		return ERR_PTR(err);
+	}
+
+	return object;
+}
+
+void mfs_free_object(void *data)
+{
+	struct mfs_cache_object *object = (struct mfs_cache_object *)data;
+
+	fput(object->cache_file);
+	kmem_cache_free(mfs_cobject_cachep, object);
+}
+
+int mfs_cache_init(void)
+{
+	mfs_cobject_cachep =
+		kmem_cache_create("mfs_object",
+				  sizeof(struct mfs_cache_object), 0,
+				  SLAB_RECLAIM_ACCOUNT, NULL);
+	if (!mfs_cobject_cachep)
+		return -ENOMEM;
+	return 0;
+}
+
+void mfs_cache_exit(void)
+{
+	kmem_cache_destroy(mfs_cobject_cachep);
+}
diff --git a/fs/mfs/data.c b/fs/mfs/data.c
index f7fa8f4e7102..2cd28e0b0222 100644
--- a/fs/mfs/data.c
+++ b/fs/mfs/data.c
@@ -6,6 +6,7 @@
 #include <linux/pagemap.h>
 #include <linux/uio.h>
 #include <linux/types.h>
+#include <linux/completion.h>
 
 static struct mfs_file_info *mfs_file_info_alloc(struct file *lower, struct file *cache)
 {
@@ -132,11 +133,179 @@ static int mfs_readdir(struct file *file, struct dir_context *ctx)
 	return iterate_dir(lfile, ctx);
 }
 
+enum range_status {
+	RANGE_DATA,
+	RANGE_HOLE,
+	RANGE_INVAL,
+};
+
+/* Continuous range with same status */
+struct range_t {
+	struct file *file;
+	loff_t off;
+	size_t max;
+	size_t len;
+	int status;
+};
+
+typedef int (*range_check) (struct range_t *r);
+
+struct range_ctx {
+	bool sync;		/* handle the miss case in sync/async way */
+	int op;
+	loff_t off;
+	size_t len;
+	struct file *file;
+	struct mfs_cache_object *object;
+	range_check checker;	/* check method for range */
+};
+
+static int range_check_disk(struct range_t *r)
+{
+	loff_t off, to, start = r->off, end = r->off + r->max;
+	struct file *file = r->file;
+	int err = 0;
+
+	off = vfs_llseek(file, start, SEEK_DATA);
+	if (off < 0) {
+		if (off == (loff_t)-ENXIO) {
+			r->len = end - start;
+			r->status = RANGE_HOLE;
+			goto out;
+		}
+		err = (int)off;
+		goto out;
+	}
+	if (off >= end) {
+		r->len = end - start;
+		r->status = RANGE_HOLE;
+		goto out;
+	}
+	if (off > start) {
+		r->len = end - off;
+		r->status = RANGE_HOLE;
+		goto out;
+	}
+	to = vfs_llseek(file, start, SEEK_HOLE);
+	if (to < 0) {
+		err = (int)to;
+		goto out;
+	}
+	if (to < end) {
+		r->len = to - start;
+		r->status = RANGE_DATA;
+		goto out;
+	}
+	r->len = end - start;
+	r->status = RANGE_DATA;
+out:
+	return err;
+}
+
+static int range_check_mem(struct range_t *r)
+{
+	struct inode *inode = file_inode(r->file);
+	struct address_space *mapping = inode->i_mapping;
+	loff_t cur_off = r->off, end = r->off + r->max;
+	struct folio *folio;
+
+	/* check from the first folio */
+	folio = filemap_get_folio(mapping, cur_off >> PAGE_SHIFT);
+	if (IS_ERR(folio)) {
+		r->status = RANGE_HOLE;
+		cur_off += PAGE_SIZE;
+	} else {
+		r->status = RANGE_DATA;
+		cur_off += folio_size(folio);
+		folio_put(folio);
+	}
+
+	while (cur_off < end) {
+		folio = filemap_get_folio(mapping, cur_off >> PAGE_SHIFT);
+		if (IS_ERR(folio)) {
+			if (r->status == RANGE_DATA)
+				break;
+			/* continuous hole */
+			cur_off += PAGE_SIZE;
+			continue;
+		}
+		if (r->status == RANGE_HOLE) {
+			folio_put(folio);
+			break;
+		}
+		cur_off += folio_size(folio);
+		folio_put(folio);
+	}
+
+	r->len = cur_off - r->off;
+	return 0;
+}
+
+static int mfs_check_range(struct range_ctx *ctx)
+{
+	loff_t start = ctx->off, end = ctx->off + ctx->len;
+	struct file *file = ctx->file;
+	struct range_t r = { .file = file };
+	size_t len = ctx->len;
+	struct mfs_syncer syncer;
+	int err = 0, err2 = 0;
+
+	if (!ctx->len)
+		return 0;
+
+	atomic_set(&syncer.notback, 1);
+	init_completion(&syncer.done);
+	INIT_LIST_HEAD(&syncer.head);
+	spin_lock_init(&syncer.list_lock);
+	atomic_set(&syncer.res, 0);
+	while (start < end) {
+		r.off = round_down(start, PAGE_SIZE);
+		r.max = len + (start - r.off);
+		r.len = 0;
+		r.status = RANGE_INVAL;
+		err = ctx->checker(&r);
+		if (err)
+			goto err;
+		switch (r.status) {
+		case RANGE_DATA:
+			start += r.len;
+			len -= r.len;
+			break;
+		case RANGE_HOLE:
+			start += r.len;
+			len -= r.len;
+			if (ctx->sync)
+				mfs_post_event_read(ctx->object, r.off, r.len, &syncer, ctx->op);
+			else
+				mfs_post_event_read(ctx->object, r.off, r.len, NULL, ctx->op);
+			break;
+		default:
+			pr_warn("invalid range status:%d\n", r.status);
+			WARN_ON_ONCE(1);
+			err = -EINVAL;
+			goto err;
+		}
+	}
+
+err:
+	if (atomic_dec_return(&syncer.notback) > 0) {
+		err2 = wait_for_completion_interruptible(&syncer.done);
+		if (err2)
+			mfs_cancel_syncer_events(ctx->object, &syncer);
+		else
+			err = atomic_read(&syncer.res);
+	}
+	return err ?: err2;
+}
+
 static ssize_t mfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct file *cfile, *file = iocb->ki_filp;
 	struct mfs_file_info *fi = file->private_data;
+	size_t isize = i_size_read(file_inode(file));
+	struct range_ctx ctx;
 	ssize_t rsize;
+	int err;
 
 	if (!iov_iter_count(to))
 		return 0;
@@ -146,6 +315,22 @@ static ssize_t mfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		return -EINVAL;
 
 	(void)get_file(cfile);
+	ctx.file = cfile;
+	ctx.object = file_inode(file)->i_private;
+	ctx.off = iocb->ki_pos;
+	ctx.op = MFS_OP_READ;
+	ctx.len = min(isize - ctx.off, iov_iter_count(to));
+	ctx.sync = false;
+	ctx.checker = range_check_mem;
+	if (need_sync_event(file_inode(file)->i_sb)) {
+		ctx.sync = true;
+		ctx.checker = range_check_disk;
+	}
+	err = mfs_check_range(&ctx);
+	if (err) {
+		fput(cfile);
+		return err;
+	}
 
 	iocb->ki_filp = cfile;
 	rsize = cfile->f_op->read_iter(iocb, to);
@@ -181,9 +366,12 @@ static vm_fault_t mfs_filemap_fault(struct vm_fault *vmf)
 {
 	struct file *cfile, *file = vmf->vma->vm_file;
 	struct mfs_file_info *fi = file->private_data;
+	size_t isize = i_size_read(file_inode(file));
 	const struct vm_operations_struct *cvm_ops;
 	struct vm_area_struct cvma, *vma, **vma_;
+	struct range_ctx ctx;
 	vm_fault_t ret;
+	int err;
 
 	vma = vmf->vma;
 	memcpy(&cvma, vma, sizeof(struct vm_area_struct));
@@ -193,8 +381,26 @@ static vm_fault_t mfs_filemap_fault(struct vm_fault *vmf)
 
 	if (unlikely(!cvm_ops->fault))
 		return VM_FAULT_SIGBUS;
+	if ((vmf->pgoff << PAGE_SHIFT) >= isize)
+		return VM_FAULT_SIGBUS;
 
 	(void)get_file(cfile);
+	ctx.file = cfile;
+	ctx.object = file_inode(file)->i_private;
+	ctx.off = vmf->pgoff << PAGE_SHIFT;
+	ctx.len = min(isize - ctx.off, PAGE_SIZE);
+	ctx.op = MFS_OP_FAULT;
+	ctx.sync = false;
+	ctx.checker = range_check_mem;
+	if (need_sync_event(file_inode(file)->i_sb)) {
+		ctx.sync = true;
+		ctx.checker = range_check_disk;
+	}
+	err = mfs_check_range(&ctx);
+	if (err) {
+		fput(cfile);
+		return VM_FAULT_SIGBUS;
+	}
 
 	/*
 	 * Dealing fault in mfs will call cachefile's fault eventually,
@@ -216,17 +422,38 @@ vm_fault_t mfs_filemap_map_pages(struct vm_fault *vmf,
 {
 	struct file *cfile, *file = vmf->vma->vm_file;
 	struct mfs_file_info *fi = file->private_data;
+	size_t isize = i_size_read(file_inode(file));
 	const struct vm_operations_struct *cvm_ops;
 	struct vm_area_struct cvma, *vma, **vma_;
+	struct range_ctx ctx;
 	vm_fault_t ret;
+	int err;
 
 	vma = vmf->vma;
 	memcpy(&cvma, vma, sizeof(struct vm_area_struct));
 	cfile = fi->cache;
 	cvm_ops = fi->cache_vm_ops;
 	cvma.vm_file = cfile;
+
+	if (unlikely(!cvm_ops->map_pages))
+		return 0;
+	if ((start_pgoff << PAGE_SHIFT) >= isize)
+		return 0;
+
 	(void)get_file(cfile);
-	if (unlikely(!cvm_ops->map_pages)) {
+	ctx.file = cfile;
+	ctx.object = file_inode(file)->i_private;
+	ctx.off = start_pgoff << PAGE_SHIFT;
+	ctx.len = min(isize - ctx.off, (end_pgoff - start_pgoff) << PAGE_SHIFT);
+	ctx.op = MFS_OP_FAROUND;
+	ctx.sync = false;
+	ctx.checker = range_check_mem;
+	if (need_sync_event(file_inode(file)->i_sb)) {
+		ctx.sync = true;
+		ctx.checker = range_check_disk;
+	}
+	err = mfs_check_range(&ctx);
+	if (err) {
 		fput(cfile);
 		return 0;
 	}
diff --git a/fs/mfs/inode.c b/fs/mfs/inode.c
index 3e4d24c517cb..3edd9bb3afa1 100644
--- a/fs/mfs/inode.c
+++ b/fs/mfs/inode.c
@@ -275,6 +275,14 @@ struct inode *mfs_iget(struct super_block *sb, struct inode *lower_inode,
 		goto err_inode;
 	}
 	inode->i_mapping->a_ops = &mfs_aops;
+	if (S_ISREG(cache_inode->i_mode)) {
+		vi->vfs_inode.i_private = mfs_alloc_object(inode, cache_path);
+		if (IS_ERR(vi->vfs_inode.i_private)) {
+			err = PTR_ERR(vi->vfs_inode.i_private);
+			vi->vfs_inode.i_private = NULL;
+			goto err_inode;
+		}
+	}
 	vi->lower = lower_inode;
 	vi->cache = cache_inode;
 	fsstack_copy_attr_all(inode, lower_inode);
diff --git a/fs/mfs/internal.h b/fs/mfs/internal.h
index ca4fc42d578b..295adc0794a3 100644
--- a/fs/mfs/internal.h
+++ b/fs/mfs/internal.h
@@ -9,12 +9,30 @@
 #include <linux/mm.h>
 #include <linux/container_of.h>
 #include <linux/spinlock_types.h>
+#include <linux/completion.h>
 #include <linux/mfs.h>
 
 #define MFS_NAME "mfs"
 
 #define MFS_OPEN_FLAGS (O_NOATIME)
 
+struct mfs_cache_object {
+	struct file *cache_file;
+	struct inode *mfs_inode;
+
+	struct rw_semaphore rwsem;
+	int fd;			/* file handle */
+	struct file *anon_file;	/* related with fd */
+};
+
+struct mfs_syncer {
+	atomic_t notback;
+	struct list_head head;
+	spinlock_t list_lock;
+	struct completion done;
+	atomic_t res;
+};
+
 struct mfs_sb_info {
 	int mode;
 	char *mtree;
@@ -130,9 +148,27 @@ static inline bool support_event(struct mfs_sb_info *sbi)
 	return sbi->mode != MFS_MODE_NONE;
 }
 
+static inline bool need_sync_event(struct super_block *sb)
+{
+	struct mfs_sb_info *sbi = MFS_SB(sb);
+
+	return sbi->mode == MFS_MODE_REMOTE;
+}
+
 struct inode *mfs_iget(struct super_block *sb, struct inode *lower_inode,
 		       struct path *cache_path);
 int mfs_alloc_dentry_info(struct dentry *dentry);
 void mfs_free_dentry_info(struct dentry *dentry);
 
+void mfs_post_event_read(struct mfs_cache_object *object,
+			 loff_t off, uint64_t len,
+			 struct mfs_syncer *syncer, int op);
+void mfs_cancel_syncer_events(struct mfs_cache_object *object,
+			      struct mfs_syncer *syncer);
+struct mfs_cache_object *mfs_alloc_object(struct inode *inode,
+					  struct path *cache_path);
+void mfs_free_object(void *data);
+int mfs_cache_init(void);
+void mfs_cache_exit(void);
+
 #endif
diff --git a/fs/mfs/super.c b/fs/mfs/super.c
index f6bc2739f350..1f715b93cb03 100644
--- a/fs/mfs/super.c
+++ b/fs/mfs/super.c
@@ -53,6 +53,8 @@ static void mfs_evict_inode(struct inode *inode)
 	struct inode *cache_inode = vi->cache;
 
 	truncate_inode_pages_final(&inode->i_data);
+	if (inode->i_private)
+		mfs_free_object(inode->i_private);
 	clear_inode(inode);
 	if (lower_inode) {
 		vi->lower = NULL;
@@ -420,6 +422,10 @@ static int __init init_mfs_fs(void)
 		goto err_dentryp;
 	}
 
+	err = mfs_cache_init();
+	if (err)
+		goto err_cache;
+
 	err = register_filesystem(&mfs_fs_type);
 	if (err)
 		goto err_register;
@@ -427,6 +433,8 @@ static int __init init_mfs_fs(void)
 	pr_info("MFS module loaded\n");
 	return 0;
 err_register:
+	mfs_cache_exit();
+err_cache:
 	kmem_cache_destroy(mfs_dentry_cachep);
 err_dentryp:
 	kmem_cache_destroy(mfs_inode_cachep);
@@ -436,6 +444,7 @@ static int __init init_mfs_fs(void)
 static void __exit exit_mfs_fs(void)
 {
 	unregister_filesystem(&mfs_fs_type);
+	mfs_cache_exit();
 	kmem_cache_destroy(mfs_dentry_cachep);
 	kmem_cache_destroy(mfs_inode_cachep);
 	pr_info("MFS module unload\n");
diff --git a/include/uapi/linux/mfs.h b/include/uapi/linux/mfs.h
index 81feff7b7fe0..78c4b57a83f5 100644
--- a/include/uapi/linux/mfs.h
+++ b/include/uapi/linux/mfs.h
@@ -6,6 +6,12 @@
 #include <linux/types.h>
 #include <linux/ioctl.h>
 
+enum mfs_opcode {
+	MFS_OP_READ = 0,
+	MFS_OP_FAULT,
+	MFS_OP_FAROUND,
+};
+
 enum {
 	MFS_MODE_NONE = 0,
 	MFS_MODE_LOCAL,
-- 
2.25.1
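
For reviewers who want to try the probing logic outside the kernel, below is a
minimal userspace sketch (illustrative only, not part of the patch) that walks
a byte range of a file and classifies it into DATA and HOLE segments with the
same SEEK_DATA/SEEK_HOLE semantics that range_check_disk() relies on. The file
path is a placeholder.

/*
 * Illustrative userspace sketch: classify [0, EOF) of a file into DATA and
 * HOLE segments using SEEK_DATA/SEEK_HOLE, mirroring the kernel-side walk in
 * range_check_disk(). The path below is a placeholder.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void walk_range(int fd, off_t start, off_t end)
{
	while (start < end) {
		off_t data = lseek(fd, start, SEEK_DATA);

		if (data < 0) {
			if (errno == ENXIO) {
				/* no data at or after start: trailing hole */
				printf("HOLE %lld..%lld\n",
				       (long long)start, (long long)end);
				return;
			}
			perror("lseek(SEEK_DATA)");
			return;
		}
		if (data > start) {
			/* a hole precedes the next data segment */
			off_t stop = data < end ? data : end;

			printf("HOLE %lld..%lld\n",
			       (long long)start, (long long)stop);
			start = stop;
			continue;
		}
		/* start sits in data: the segment runs up to the next hole */
		off_t hole = lseek(fd, start, SEEK_HOLE);
		off_t stop = hole < end ? hole : end;

		printf("DATA %lld..%lld\n",
		       (long long)start, (long long)stop);
		start = stop;
	}
}

int main(void)
{
	int fd = open("/path/to/cachefile", O_RDONLY);	/* placeholder path */
	off_t size;

	if (fd < 0)
		return 1;
	size = lseek(fd, 0, SEEK_END);
	walk_range(fd, 0, size);
	close(fd);
	return 0;
}

Note that lseek(SEEK_HOLE) always finds the implicit hole at EOF, so the walk
makes forward progress whenever the cursor sits inside data.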