
From: Jingbo Xu <jefflexu@linux.alibaba.com>

anolis inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IC6CFR
CVE: NA

--------------------------------

ANBZ: #9340

Readahead may be starved by the writeback wave: the writeback routine
sends forced background requests, which are enqueued on the bg_queue
list without considering the max_background limit, while the background
requests sent by the readahead routine are non-forced and thus
throttled by the max_background limit.  Hundreds of thousands of WRITE
requests can end up queued on the bg_queue list ahead of READ requests,
so asynchronous readahead is starved by the writeback wave.

Fix this by introducing two bg_queue lists and separating WRITE
requests from the others.  Also make the readahead routine send forced
background requests.

Besides, introduce the FUSE_SEPARATE_BACKGROUND init flag.  When
FUSE_SEPARATE_BACKGROUND is set, there are two separate background
queues, one for WRITE requests and one for the others.  The number of
active background requests is then counted separately for these two
sorts of requests, so that at most max_background background requests
of each sort are in flight.

Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/3352

Conflicts:
	fs/fuse/dev.c
	fs/fuse/fuse_i.h
	fs/fuse/inode.c
	include/uapi/linux/fuse.h
[Context conflict.]

Signed-off-by: Wang Zhaolong <wangzhaolong1@huawei.com>
---
 fs/fuse/dev.c             | 70 ++++++++++++++++++++++++++++++++++-----
 fs/fuse/file.c            |  5 +++
 fs/fuse/fuse_i.h          | 21 +++++++++---
 fs/fuse/inode.c           | 11 +++---
 include/uapi/linux/fuse.h |  4 +++
 5 files changed, 93 insertions(+), 18 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 680c467f50b3..92301dbe7f77 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -27,10 +27,12 @@ MODULE_ALIAS("devname:fuse");
 
 /* Ordinary requests have even IDs, while interrupts IDs are odd */
 #define FUSE_INT_REQ_BIT (1ULL << 0)
 #define FUSE_REQ_ID_STEP (1ULL << 1)
 
+#define DEFAULT_BG_QUEUE READ
+
 static struct kmem_cache *fuse_req_cachep;
 
 static void end_requests(struct list_head *head);
 
 static struct fuse_dev *fuse_get_dev(struct file *file)
@@ -252,25 +254,75 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
 		kfree(forget);
 		spin_unlock(&fiq->lock);
 	}
 }
 
-static void flush_bg_queue(struct fuse_conn *fc)
+static void fuse_add_bg_queue(struct fuse_conn *fc, struct fuse_req *req)
 {
-	struct fuse_iqueue *fiq = &fc->iq;
+	if (fc->separate_background) {
+		if (req->args->opcode == FUSE_WRITE)
+			list_add_tail(&req->list, &fc->bg_queue[WRITE]);
+		else
+			list_add_tail(&req->list, &fc->bg_queue[READ]);
+	} else {
+		/* default to one single background queue */
+		list_add_tail(&req->list, &fc->bg_queue[DEFAULT_BG_QUEUE]);
+	}
+}
 
-	while (fc->active_background < fc->max_background &&
-	       !list_empty(&fc->bg_queue)) {
-		struct fuse_req *req;
+static void fuse_dec_active_bg(struct fuse_conn *fc, struct fuse_req *req)
+{
+	if (fc->separate_background) {
+		if (req->args->opcode == FUSE_WRITE)
+			fc->active_background[WRITE]--;
+		else
+			fc->active_background[READ]--;
+	} else {
+		/* default to one single count */
+		fc->active_background[DEFAULT_BG_QUEUE]--;
+	}
+}
 
-		req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
+/* bg_queue needs to be further flushed when true returned */
+static bool do_flush_bg_queue(struct fuse_conn *fc, unsigned int index,
+			      unsigned int batch)
+{
+	struct fuse_iqueue *fiq = &fc->iq;
+	struct fuse_req *req;
+	unsigned int count = 0;
+
+	while (fc->active_background[index] < fc->max_background &&
+	       !list_empty(&fc->bg_queue[index])) {
+		if (batch && count++ == batch)
+			return true;
+		req = list_first_entry(&fc->bg_queue[index],
+				       struct fuse_req, list);
 		list_del(&req->list);
-		fc->active_background++;
+		fc->active_background[index]++;
 		spin_lock(&fiq->lock);
 		req->in.h.unique = fuse_get_unique(fiq);
 		queue_request_and_unlock(fiq, req);
 	}
+	return false;
+}
+
+static void flush_bg_queue(struct fuse_conn *fc)
+{
+	if (!fc->separate_background) {
+		do_flush_bg_queue(fc, DEFAULT_BG_QUEUE, 0);
+	} else {
+		bool proceed_write = true, proceed_other = true;
+
+		do {
+			if (proceed_other)
+				proceed_other = do_flush_bg_queue(fc, READ,
+						FUSE_DEFAULT_MAX_BACKGROUND);
+			if (proceed_write)
+				proceed_write = do_flush_bg_queue(fc, WRITE,
+						FUSE_DEFAULT_MAX_BACKGROUND);
+		} while (proceed_other || proceed_write);
+	}
 }
 
 /*
  * This function is called when a request is finished. Either a reply
  * has arrived or it was aborted (and not yet sent) or some error
@@ -316,11 +368,11 @@ void fuse_request_end(struct fuse_req *req)
 			if (waitqueue_active(&fc->blocked_waitq))
 				wake_up(&fc->blocked_waitq);
 		}
 
 		fc->num_background--;
-		fc->active_background--;
+		fuse_dec_active_bg(fc, req);
 		flush_bg_queue(fc);
 		spin_unlock(&fc->bg_lock);
 	} else {
 		/* Wake up waiter sleeping in request_wait_answer() */
 		wake_up(&req->waitq);
@@ -538,11 +590,11 @@ static bool fuse_request_queue_background(struct fuse_req *req)
 	spin_lock(&fc->bg_lock);
 	if (likely(fc->connected)) {
 		fc->num_background++;
 		if (fc->num_background == fc->max_background)
 			fc->blocked = 1;
-		list_add_tail(&req->list, &fc->bg_queue);
+		fuse_add_bg_queue(fc, req);
 		flush_bg_queue(fc);
 		queued = true;
 	}
 	spin_unlock(&fc->bg_lock);
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 25d6951ef2c0..7200b176ac79 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -980,10 +980,15 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
 	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
 	ia->read.attr_ver = fuse_get_attr_version(fm->fc);
 	if (fm->fc->async_read) {
 		ia->ff = fuse_file_get(ff);
 		ap->args.end = fuse_readpages_end;
+		/* force background request to avoid starvation from writeback */
+		if (fm->fc->separate_background) {
+			ap->args.force = true;
+			ap->args.nocreds = true;
+		}
 		err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
 		if (!err)
 			return;
 	} else {
 		res = fuse_simple_request(fm, &ap->args);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 0428658c499e..c9902fb877cb 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -37,10 +37,13 @@
 #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
 
 /** Maximum of max_pages received in init_out */
 #define FUSE_MAX_MAX_PAGES 256
 
+/** Maximum number of outstanding background requests */
+#define FUSE_DEFAULT_MAX_BACKGROUND 12
+
 /** Bias for fi->writectr, meaning new writepages must not be sent */
 #define FUSE_NOWRITE INT_MIN
 
 /** It could be as large as PATH_MAX, but would that have any uses? */
 #define FUSE_NAME_MAX 1024
@@ -662,15 +665,22 @@ struct fuse_conn {
 	unsigned congestion_threshold;
 
 	/** Number of requests currently in the background */
 	unsigned num_background;
 
-	/** Number of background requests currently queued for userspace */
-	unsigned active_background;
+	/*
+	 * Number of background requests currently queued for userspace.
+	 * active_background[WRITE] for WRITE requests, and
+	 * active_background[READ] for others.
+	 */
+	unsigned active_background[2];
 
-	/** The list of background requests set aside for later queuing */
-	struct list_head bg_queue;
+	/*
+	 * The list of background requests set aside for later queuing.
+	 * bg_queue[WRITE] for WRITE requests, bg_queue[READ] for others.
+	 */
+	struct list_head bg_queue[2];
 
 	/** Protects: max_background, congestion_threshold, num_background,
 	 * active_background, bg_queue, blocked */
 	spinlock_t bg_lock;
 
@@ -865,10 +875,13 @@ struct fuse_conn {
 	unsigned int no_tmpfile:1;
 
 	/* Relax restrictions to allow shared mmap in FOPEN_DIRECT_IO mode */
 	unsigned int direct_io_allow_mmap:1;
 
+	/* separate background queue for WRITE requests and the others */
+	unsigned int separate_background:1;
+
 	/* Is statx not implemented by fs? */
 	unsigned int no_statx:1;
 
 	/* Use pages instead of pointer for kernel I/O */
 	unsigned int use_pages_for_kvec_io:1;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index b67928a773c6..a1669c498f5e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -51,13 +51,10 @@ MODULE_PARM_DESC(max_user_congthresh,
 		 "Global limit for the maximum congestion threshold an "
 		 "unprivileged user can set");
 
 #define FUSE_DEFAULT_BLKSIZE 512
 
-/** Maximum number of outstanding background requests */
-#define FUSE_DEFAULT_MAX_BACKGROUND 12
-
 /** Congestion starts at 75% of maximum */
 #define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
 
 #ifdef CONFIG_BLOCK
 static struct file_system_type fuseblk_fs_type;
@@ -933,11 +930,12 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
 	init_rwsem(&fc->killsb);
 	refcount_set(&fc->count, 1);
 	atomic_set(&fc->dev_count, 1);
 	init_waitqueue_head(&fc->blocked_waitq);
 	fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv);
-	INIT_LIST_HEAD(&fc->bg_queue);
+	INIT_LIST_HEAD(&fc->bg_queue[READ]);
+	INIT_LIST_HEAD(&fc->bg_queue[WRITE]);
 	INIT_LIST_HEAD(&fc->entry);
 	INIT_LIST_HEAD(&fc->devices);
 	atomic_set(&fc->num_waiting, 0);
 	fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
 	fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
@@ -1352,10 +1350,12 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 			fc->max_stack_depth = arg->max_stack_depth;
 			fm->sb->s_stack_depth = arg->max_stack_depth;
 		}
 		if (flags & FUSE_NO_EXPORT_SUPPORT)
 			fm->sb->s_export_op = &fuse_export_fid_operations;
+		if (flags & FUSE_SEPARATE_BACKGROUND)
+			fc->separate_background = 1;
 	} else {
 		ra_pages = fc->max_read / PAGE_SIZE;
 		fc->no_lock = 1;
 		fc->no_flock = 1;
 	}
@@ -1399,11 +1399,12 @@ void fuse_send_init(struct fuse_mount *fm)
 		FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
 		FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
 		FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
 		FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
 		FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP |
-		FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND;
+		FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND |
+		FUSE_SEPARATE_BACKGROUND;
 #ifdef CONFIG_FUSE_DAX
 	if (fm->fc->dax)
 		flags |= FUSE_MAP_ALIGNMENT;
 	if (fuse_is_inode_dax_mode(fm->fc->dax_mode))
 		flags |= FUSE_HAS_INODE_DAX;
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index d08b99d60f6f..2a84cecf75a1 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -419,10 +419,12 @@ struct fuse_file_lock {
  * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation
  * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode.
 * FUSE_NO_EXPORT_SUPPORT: explicitly disable export support
 * FUSE_HAS_RESEND: kernel supports resending pending requests, and the high bit
 *		    of the request ID indicates resend requests
+ * FUSE_SEPARATE_BACKGROUND: separate background queue for WRITE requests and
+ *			     the others
  */
 #define FUSE_ASYNC_READ		(1 << 0)
 #define FUSE_POSIX_LOCKS	(1 << 1)
 #define FUSE_FILE_OPS		(1 << 2)
 #define FUSE_ATOMIC_O_TRUNC	(1 << 3)
@@ -461,10 +463,12 @@ struct fuse_file_lock {
 #define FUSE_HAS_EXPIRE_ONLY	(1ULL << 35)
 #define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36)
 #define FUSE_PASSTHROUGH	(1ULL << 37)
 #define FUSE_NO_EXPORT_SUPPORT	(1ULL << 38)
 #define FUSE_HAS_RESEND		(1ULL << 39)
+#define FUSE_SEPARATE_BACKGROUND	(1ULL << 56)
+/* The 57th bit is left to FUSE_HAS_RECOVERY */
 
 /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */
 #define FUSE_DIRECT_IO_RELAX	FUSE_DIRECT_IO_ALLOW_MMAP
 
 /**
-- 
2.34.3
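
For userspace implementors, below is a minimal sketch (not part of the
patch) of how a server speaking the raw /dev/fuse protocol could opt in
to the new flag when answering FUSE_INIT.  It assumes a <linux/fuse.h>
that already carries this patch; fill_init_reply() and the particular
"want" flags and limits are made up for the example.

/*
 * Illustrative sketch only: opt in to FUSE_SEPARATE_BACKGROUND when
 * filling the FUSE_INIT reply.  The read/write loop, error handling
 * and all other requests are omitted.
 */
#include <linux/fuse.h>
#include <stdint.h>
#include <string.h>

static void fill_init_reply(const struct fuse_init_in *in,
			    struct fuse_init_out *out)
{
	uint64_t kernel_flags = in->flags;
	uint64_t want = FUSE_ASYNC_READ;	/* whatever the server needs */

	/* Bits 32..63 are only valid when the kernel announced FUSE_INIT_EXT. */
	if (in->flags & FUSE_INIT_EXT)
		kernel_flags |= (uint64_t)in->flags2 << 32;

	/*
	 * Opt in to the separate WRITE background queue if the kernel
	 * offers it.  The kernel only consumes flags2 of the reply when
	 * the reply itself carries FUSE_INIT_EXT.
	 */
	if (kernel_flags & FUSE_SEPARATE_BACKGROUND)
		want |= FUSE_INIT_EXT | FUSE_SEPARATE_BACKGROUND;

	memset(out, 0, sizeof(*out));
	out->major = FUSE_KERNEL_VERSION;
	out->minor = FUSE_KERNEL_MINOR_VERSION;
	out->max_readahead = in->max_readahead;
	out->flags = (uint32_t)want;
	out->flags2 = (uint32_t)(want >> 32);
	out->max_background = 64;		/* example limits */
	out->congestion_threshold = 48;
	out->max_write = 1024 * 1024;
}

Since FUSE_SEPARATE_BACKGROUND lives above bit 31, it travels in
fuse_init_out.flags2, which is why the reply also sets FUSE_INIT_EXT.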