
From: Yipeng Zou <zouyipeng@huawei.com>

Add a cache mode for fds woken up in epoll_pwait.

In epoll_pwait, data from each ready fd is read into a pre-allocated
kernel cache buffer. A subsequent sys_read then copies the data from
that cache buffer to user space. This lets read data be prefetched
asynchronously from epoll_pwait.

Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
---
 fs/eventpoll.c                    | 229 ++++++++++++++++++++++++++++++
 fs/open.c                         |   4 +
 fs/read_write.c                   |  92 +++++++++++-
 include/linux/fs.h                |  31 ++++
 include/linux/syscalls.h          |   7 +
 include/uapi/asm-generic/unistd.h |   1 +
 kernel/sysctl.c                   |  36 +++++
 7 files changed, 399 insertions(+), 1 deletion(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5ce1ea1f452b..be34d94d26bd 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -128,6 +128,8 @@ struct nested_calls {
 	spinlock_t lock;
 };
 
+static struct workqueue_struct *rc_work;
+
 /*
  * Each file descriptor added to the eventpoll interface will
  * have an entry of this type linked to the "rbr" RB tree.
@@ -768,6 +770,67 @@ static void epi_rcu_free(struct rcu_head *head)
 	kmem_cache_free(epi_cache, epi);
 }
 
+#ifdef CONFIG_FAST_SYSCALL
+#define PREFETCH_ITEM_HASH_BITS	6
+#define PREFETCH_ITEM_TABLE_SIZE	(1 << PREFETCH_ITEM_HASH_BITS)
+DEFINE_HASHTABLE(xcall_item_table, PREFETCH_ITEM_HASH_BITS);
+DEFINE_RAW_SPINLOCK(xcall_table_lock);
+
+/*
+ * Look up the prefetch item registered for @file in xcall_item_table.
+ *
+ * Returns the item, or NULL if @file has none.  The table lock is dropped
+ * before returning, so the lookup itself does not keep the item alive.
+ */
+struct prefetch_item *find_prefetch_item(struct file *file)
+{
+	struct prefetch_item *found = NULL;
+	unsigned int hash = hash_64((u64)file, PREFETCH_ITEM_HASH_BITS);
+
+	raw_spin_lock(&xcall_table_lock);
+	hash_for_each_possible(xcall_item_table, found, node, hash) {
+		if (found->f == file)
+			break;
+	}
+	raw_spin_unlock(&xcall_table_lock);
+
+	return found;
+}
+
+/*
+ * Remove the prefetch item of @file from xcall_item_table and free it
+ * together with its cache buffer.  Does nothing if @file has no item.
+ */
+void free_prefetch_item(struct file *file)
+{
+	struct prefetch_item *pfi;
+	unsigned int hash = hash_64((u64)file, PREFETCH_ITEM_HASH_BITS);
+
+	/*
+	 * Look up and unhash in one critical section: with two separate
+	 * lock acquisitions, concurrent callers could both find the item
+	 * and both free it.
+	 */
+	raw_spin_lock(&xcall_table_lock);
+	hash_for_each_possible(xcall_item_table, pfi, node, hash) {
+		if (pfi->f == file) {
+			hlist_del_init(&pfi->node);
+			break;
+		}
+	}
+	raw_spin_unlock(&xcall_table_lock);
+
+	/* The iterator leaves pfi NULL when nothing matched. */
+	if (!pfi)
+		return;
+
+	/* The worker uses the item; make sure it is idle before freeing. */
+	cancel_work_sync(&pfi->work);
+	kfree(pfi->cache);
+	kfree(pfi);
+}
+#endif
+
 /*
  * Removes a "struct epitem" from the eventpoll RB tree and deallocates
  * all the associated resources. Must be called with "mtx" held.
@@ -783,6 +824,17 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	 */
 	ep_unregister_pollwait(ep, epi);
 
+#ifdef CONFIG_FAST_SYSCALL
+	/* Cancel this file's prefetch work, waiting for a running instance. */
+	if (current->xcall_select &&
+	    test_bit(__NR_epoll_pwait, current->xcall_select)) {
+		struct prefetch_item *pfi = find_prefetch_item(file);
+
+		if (pfi)
+			cancel_work_sync(&pfi->work);
+	}
+#endif
+
 	/* Remove the current item from the list of epoll hooks */
 	spin_lock(&file->f_lock);
 	list_del_rcu(&epi->fllink);
@@ -1191,6 +1243,212 @@ static inline bool chain_epi_lockless(struct epitem *epi)
 	return true;
 }
 
+
+#ifdef CONFIG_FAST_SYSCALL
+int max_fd_cache_pages = 1;
+
+/*
+ * Fill @pfi's cache from its file.  Does nothing unless the item is QUEUED
+ * and its cache is empty.  The kernel_read() result (a byte count or a
+ * negative errno) is stored in pfi->len and the item is marked READY.
+ * Called with pfi->pfi_lock held.
+ */
+static void do_prefetch_item(struct prefetch_item *pfi)
+{
+	/* Bail out on NULL before pfi is dereferenced below. */
+	if (!pfi || pfi->state != EPOLL_FILE_CACHE_QUEUED)
+		return;
+
+	if (pfi->len > 0)
+		return;
+
+	/*
+	 * Bound the read by the size the buffer was allocated with; the
+	 * max_xcall_cache_pages sysctl may have been raised since then.
+	 */
+	pfi->len = kernel_read(pfi->f, pfi->cache, pfi->cache_size,
+			       &pfi->f->f_pos);
+	pfi->state = EPOLL_FILE_CACHE_READY;
+}
+
+/* CPU masks indexed by NUMA node id, set via the xcall_numaN_cpumask sysctls. */
+struct cpumask xcall_numa_cpumask[4] __read_mostly;
+unsigned long *xcall_numa_cpumask_bits0 = cpumask_bits(&xcall_numa_cpumask[0]);
+unsigned long *xcall_numa_cpumask_bits1 = cpumask_bits(&xcall_numa_cpumask[1]);
+unsigned long *xcall_numa_cpumask_bits2 = cpumask_bits(&xcall_numa_cpumask[2]);
+unsigned long *xcall_numa_cpumask_bits3 = cpumask_bits(&xcall_numa_cpumask[3]);
+
+#ifdef CONFIG_SYSCTL
+/* Clear every CPU that is not in cpu_possible_mask from the xcall masks. */
+static void proc_xcall_update(void)
+{
+	int i;
+
+	/* Remove impossible cpus to keep sysctl output clean. */
+	for (i = 0; i < ARRAY_SIZE(xcall_numa_cpumask); i++)
+		cpumask_and(&xcall_numa_cpumask[i], &xcall_numa_cpumask[i], cpu_possible_mask);
+}
+
+/*
+ * sysctl handler for the xcall_numaN_cpumask entries.  Defers to
+ * proc_do_large_bitmap() and, after a successful write, trims the masks to
+ * the possible CPUs.  Returns the proc_do_large_bitmap() result.
+ */
+int proc_xcall_numa_cpumask(struct ctl_table *table, int write,
+	void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int err;
+
+	/* TODO: serialize against concurrent writers and mask readers. */
+	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
+	if (!err && write)
+		proc_xcall_update();
+
+	return err;
+}
+#endif /* CONFIG_SYSCTL */
+
+/* Work callback: run the prefetch for the item that embeds @work. */
+static void prefetch_work_fn(struct work_struct *work)
+{
+	struct prefetch_item *pfi = container_of(work, struct prefetch_item, work);
+
+	/* kernel_read() may sleep, so pfi_lock is a mutex rather than a spinlock. */
+	mutex_lock(&pfi->pfi_lock);
+	do_prefetch_item(pfi);
+	mutex_unlock(&pfi->pfi_lock);
+}
+
+/*
+ * Return the @n-th (zero-based) CPU set in @mask, or the first CPU in the
+ * mask when there is no such CPU.
+ */
+static int get_nth_cpu_in_cpumask(const struct cpumask *mask, int n)
+{
+	int count = 0;
+	int cpu;
+
+	for_each_cpu(cpu, mask) {
+		if (count == n)
+			return cpu;
+		count++;
+	}
+
+	return cpumask_first(mask);
+}
+
+/*
+ * Allocate a prefetch item for @epi's file, choose the CPU its prefetch work
+ * is bound to, and add it to xcall_item_table.
+ *
+ * Returns the new item, or NULL if the current task has not selected xcall
+ * for epoll_pwait or if an allocation fails.
+ */
+static struct prefetch_item *alloc_prefetch_item(struct epitem *epi)
+{
+	struct file *tfile = epi->ffd.file;
+	struct prefetch_item *pfi;
+	int fd = epi->ffd.fd;
+	int cpu, nid;
+	long pages;
+
+	if (!current->xcall_select ||
+	    !test_bit(__NR_epoll_pwait, current->xcall_select))
+		return NULL;
+
+	/* Initialization prefetch item */
+	pfi = kmalloc(sizeof(*pfi), GFP_KERNEL);
+	if (!pfi)
+		return NULL;
+
+	/*
+	 * Sample the sysctl once, clamp it to what kmalloc can serve, and
+	 * record the size in the item so every later access uses the same
+	 * bound as the allocation.
+	 */
+	pages = clamp_t(long, READ_ONCE(max_fd_cache_pages), 1,
+			KMALLOC_MAX_SIZE >> PAGE_SHIFT);
+	pfi->cache_size = pages * PAGE_SIZE;
+	pfi->cache = kzalloc(pfi->cache_size, GFP_KERNEL);
+	if (!pfi->cache) {
+		kfree(pfi);
+		return NULL;
+	}
+
+	/* Init Read Cache mode */
+	pfi->state = EPOLL_FILE_CACHE_NONE;
+	INIT_WORK(&pfi->work, prefetch_work_fn);
+	INIT_HLIST_NODE(&pfi->node);
+	mutex_init(&pfi->pfi_lock);
+	pfi->fd = fd;
+	pfi->f = tfile;
+	pfi->len = 0;
+	pfi->pos = 0;
+	/* Only a placement hint, so the raw variant (no preempt check) is enough. */
+	cpu = raw_smp_processor_id();
+	nid = numa_node_id();
+	cpumask_and(&pfi->related_cpus, cpu_cpu_mask(cpu), cpu_online_mask);
+	if (nid >= 0 && nid < (int)ARRAY_SIZE(xcall_numa_cpumask) &&
+	    !cpumask_empty(&xcall_numa_cpumask[nid]) &&
+	    cpumask_subset(&xcall_numa_cpumask[nid], cpu_cpu_mask(cpu)))
+		cpumask_and(&pfi->related_cpus, &pfi->related_cpus, &xcall_numa_cpumask[nid]);
+
+	/*
+	 * The sysctl mask may contain only offline CPUs, which leaves the
+	 * intersection empty.  Fall back to the current CPU so the modulo
+	 * below cannot divide by zero.
+	 */
+	if (cpumask_empty(&pfi->related_cpus))
+		cpumask_copy(&pfi->related_cpus, cpumask_of(cpu));
+	pfi->cpu = get_nth_cpu_in_cpumask(&pfi->related_cpus,
+					  fd % cpumask_weight(&pfi->related_cpus));
+
+	raw_spin_lock(&xcall_table_lock);
+	hash_add(xcall_item_table, &pfi->node, hash_64((u64)tfile, PREFETCH_ITEM_HASH_BITS));
+	raw_spin_unlock(&xcall_table_lock);
+
+	return pfi;
+}
+
+/*
+ * Queue a prefetch for @epi's file if it is polled for EPOLLIN and its cache
+ * is idle, allocating the prefetch item on first use.  The work goes to the
+ * item's CPU, or to the next CPU of its mask when that is the current CPU.
+ */
+static void ep_prefetch_item_enqueue(struct eventpoll *ep, struct epitem *epi)
+{
+	struct prefetch_item *pfi = find_prefetch_item(epi->ffd.file);
+	int t_cpu;
+
+	if (!pfi) {
+		pfi = alloc_prefetch_item(epi);
+		if (pfi == NULL)
+			return;
+	}
+
+	/* Unlocked fast path: skip items that are already queued or ready. */
+	if (!pfi->cache || !(epi->event.events & EPOLLIN) ||
+	    READ_ONCE(pfi->state) != EPOLL_FILE_CACHE_NONE)
+		return;
+
+	if (pfi->cpu == raw_smp_processor_id()) {
+		t_cpu = cpumask_next(pfi->cpu, &pfi->related_cpus);
+		if (t_cpu > cpumask_last(&pfi->related_cpus))
+			t_cpu = cpumask_first(&pfi->related_cpus);
+	} else {
+		t_cpu = pfi->cpu;
+	}
+
+	/* Re-check under the lock; the state may have changed meanwhile. */
+	mutex_lock(&pfi->pfi_lock);
+	if (pfi->state == EPOLL_FILE_CACHE_NONE) {
+		pfi->state = EPOLL_FILE_CACHE_QUEUED;
+		queue_work_on(t_cpu, rc_work, &pfi->work);
+	}
+	mutex_unlock(&pfi->pfi_lock);
+}
+#endif
+
 /*
  * This is the callback that is passed to the wait queue wakeup
  * mechanism. It is called by the stored file descriptors when they
@@ -1751,6 +2009,12 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head
 		if (!revents)
 			continue;
 
+#ifdef CONFIG_FAST_SYSCALL
+		if (current->xcall_select &&
+		    test_bit(__NR_epoll_pwait, current->xcall_select))
+			ep_prefetch_item_enqueue(ep, epi);
+#endif
+
 		if (__put_user(revents, &uevent->events) ||
 		    __put_user(epi->event.data, &uevent->data)) {
 			list_add(&epi->rdllink, head);
@@ -2383,6 +2647,26 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 		size_t, sigsetsize)
 {
 	int error;
+	/*
+	 * If the caller wants a certain signal mask to be set during the wait,
+	 * we apply it here.
+	 */
+	error = set_user_sigmask(sigmask, sigsetsize);
+	if (error)
+		return error;
+
+	error = do_epoll_wait(epfd, events, maxevents, timeout);
+	restore_saved_sigmask_unless(error == -EINTR);
+
+	return error;
+}
+
+#ifdef CONFIG_FAST_SYSCALL
+XCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
+		size_t, sigsetsize)
+{
+	int error;
 
 	/*
 	 * If the caller wants a certain signal mask to be set during the wait,
@@ -2397,6 +2681,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 
 	return error;
 }
+#endif
 
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
@@ -2454,6 +2739,14 @@ static int __init eventpoll_init(void)
 	pwq_cache = kmem_cache_create("eventpoll_pwq",
 		sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
 
+#ifdef CONFIG_FAST_SYSCALL
+	rc_work = alloc_workqueue("eventpoll_rc", 0, 0);
+	if (!rc_work)
+		return -ENOMEM;
+
+	hash_init(xcall_item_table);
+#endif
+
 	return 0;
 }
 fs_initcall(eventpoll_init);
diff --git a/fs/open.c b/fs/open.c
index 96de0d3f1a8b..46308348a774 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1287,6 +1287,10 @@ int filp_close(struct file *filp, fl_owner_t id)
 		return 0;
 	}
 
+#ifdef CONFIG_FAST_SYSCALL
+	free_prefetch_item(filp);
+#endif
+
 	if (filp->f_op->flush)
 		retval = filp->f_op->flush(filp, id);
 
diff --git a/fs/read_write.c b/fs/read_write.c
index da03b3e65cf3..81ca30ff069c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -617,13 +617,108 @@ static inline loff_t *file_ppos(struct file *file)
 	return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
 }
 
+#ifdef CONFIG_FAST_SYSCALL
+DEFINE_PER_CPU_ALIGNED(unsigned long, xcall_cache_hit);
+EXPORT_PER_CPU_SYMBOL(xcall_cache_hit);
+
+DEFINE_PER_CPU_ALIGNED(unsigned long, xcall_cache_miss);
+EXPORT_PER_CPU_SYMBOL(xcall_cache_miss);
+
+DEFINE_PER_CPU_ALIGNED(unsigned long, xcall_cache_wait);
+EXPORT_PER_CPU_SYMBOL(xcall_cache_wait);
+
+/*
+ * Serve a read() from the data prefetched into @pfi during epoll_pwait.
+ *
+ * Returns the number of bytes copied to @buf (0 if the prefetch itself read
+ * nothing), -EFAULT if the copy to user space faulted, or -EAGAIN when there
+ * is no usable cached data and the caller must fall back to vfs_read().
+ * Unless -EAGAIN is returned, @f has been released with fdput_pos().
+ */
+static ssize_t xcall_read(struct prefetch_item *pfi, struct fd *f, unsigned int fd,
+			  char __user *buf, size_t count)
+{
+	ssize_t copy_len;
+	ssize_t ret = 0;
+
+	/* copy_to_user() may fault and sleep, so pfi_lock is a mutex. */
+	if (!mutex_trylock(&pfi->pfi_lock)) {
+		this_cpu_inc(xcall_cache_wait);
+		mutex_lock(&pfi->pfi_lock);
+	}
+
+	copy_len = pfi->len;
+	if (pfi->state != EPOLL_FILE_CACHE_READY || copy_len < 0)
+		goto reset_pfi;
+
+	if (copy_len == 0)
+		goto hit_return;
+
+	copy_len = min_t(size_t, copy_len, count);
+	if (copy_to_user(buf, pfi->cache + pfi->pos, copy_len)) {
+		/* Report the fault and keep the cached data unconsumed. */
+		ret = -EFAULT;
+		goto hit_return;
+	}
+	ret = copy_len;
+
+	pfi->len -= copy_len;
+	if (pfi->len <= 0) {
+		pfi->len = 0;
+		pfi->state = EPOLL_FILE_CACHE_NONE;
+	}
+
+	pfi->pos += copy_len;
+	if (pfi->pos >= pfi->cache_size || pfi->len == 0)
+		pfi->pos = 0;
+
+hit_return:
+	this_cpu_inc(xcall_cache_hit);
+	fdput_pos(*f);
+	mutex_unlock(&pfi->pfi_lock);
+
+	return ret;
+
+reset_pfi:
+	/* Always reset cache state to none */
+	pfi->len = 0;
+	pfi->state = EPOLL_FILE_CACHE_NONE;
+	this_cpu_inc(xcall_cache_miss);
+	cancel_work(&pfi->work);
+	mutex_unlock(&pfi->pfi_lock);
+
+	return -EAGAIN;
+}
+#endif
+
 ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 {
 	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
+	loff_t pos, *ppos;
+#ifdef CONFIG_FAST_SYSCALL
+	struct prefetch_item *pfi;
+
+	if (!current->xcall_select ||
+	    !test_bit(__NR_epoll_pwait, current->xcall_select))
+		goto vfs_read;
+
+	if (!f.file)
+		goto vfs_read;
+
+	pfi = find_prefetch_item(f.file);
+	if (!pfi || !pfi->cache)
+		goto vfs_read;
+
+	/* Serve from the prefetch cache; -EAGAIN means use the normal path. */
+	ret = xcall_read(pfi, &f, fd, buf, count);
+	if (ret != -EAGAIN)
+		return ret;
+vfs_read:
+#endif
 
 	if (f.file) {
-		loff_t pos, *ppos = file_ppos(f.file);
+		ppos = file_ppos(f.file);
 		if (ppos) {
 			pos = *ppos;
 			ppos = &pos;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a0ea6b64c45d..097b27291044 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -947,6 +947,32 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
 		index <  ra->start + ra->size);
 }
 
+#define EPOLL_FILE_CACHE_NONE	0
+#define EPOLL_FILE_CACHE_QUEUED	1
+#define EPOLL_FILE_CACHE_READY	2
+
+/* Per-file read cache filled from epoll_pwait and drained by read(). */
+struct prefetch_item {
+	struct file *f;
+	int fd;
+	struct work_struct work;
+	int cpu;
+	cpumask_t related_cpus;
+	char *cache;
+	/* allocated size of @cache in bytes, fixed at allocation time */
+	size_t cache_size;
+	ssize_t len;
+	/* cache state in epoll_wait */
+	int state;
+	/* a mutex because it is held across kernel_read() and copy_to_user() */
+	struct mutex pfi_lock;
+	loff_t pos;
+	struct hlist_node node;
+};
+
+#define MAX_FD_CACHE 1024
+extern int max_fd_cache_pages;
+
 struct file {
 	union {
 		struct llist_node	fu_llist;
@@ -3750,4 +3776,13 @@ static inline bool cachefiles_ondemand_is_enabled(void)
 }
 #endif
 
+#ifdef CONFIG_FAST_SYSCALL
+DECLARE_PER_CPU_ALIGNED(unsigned long, xcall_cache_hit);
+DECLARE_PER_CPU_ALIGNED(unsigned long, xcall_cache_miss);
+DECLARE_PER_CPU_ALIGNED(unsigned long, xcall_cache_wait);
+
+struct prefetch_item *find_prefetch_item(struct file *file);
+void free_prefetch_item(struct file *file);
+#endif
+
 #endif /* _LINUX_FS_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 0e379bcd8194..2527c32adad1 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -236,6 +236,13 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
 
 #define XCALL_DEFINEx(x, sname, ...)				\
 	__XCALL_DEFINEx(x, sname, __VA_ARGS__)
+
+extern unsigned long *xcall_numa_cpumask_bits0;
+extern unsigned long *xcall_numa_cpumask_bits1;
+extern unsigned long *xcall_numa_cpumask_bits2;
+extern unsigned long *xcall_numa_cpumask_bits3;
+int proc_xcall_numa_cpumask(struct ctl_table *table, int write,
+	void *buffer, size_t *lenp, loff_t *ppos);
 #endif
 
 #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 9b38861d9ea8..41ed441c3c3a 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -98,6 +98,7 @@ __SYSCALL(__NR_epoll_create1, sys_epoll_create1)
 __SYSCALL(__NR_epoll_ctl, sys_epoll_ctl)
 #define __NR_epoll_pwait 22
 __SC_COMP(__NR_epoll_pwait, sys_epoll_pwait, compat_sys_epoll_pwait)
+__XCALL_SC_COMP(__NR_epoll_pwait, sys_epoll_pwait, compat_sys_epoll_pwait)
 
 /* fs/fcntl.c */
 #define __NR_dup 23
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b4b36f8a3149..02b55955b725 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2861,6 +2861,44 @@ static struct ctl_table kern_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &hundred_thousand,
 	},
+#endif
+#ifdef CONFIG_FAST_SYSCALL
+	{
+		.procname	= "xcall_numa0_cpumask",
+		.data		= &xcall_numa_cpumask_bits0,
+		.maxlen		= NR_CPUS,
+		.mode		= 0644,
+		.proc_handler	= proc_xcall_numa_cpumask,
+	},
+	{
+		.procname	= "xcall_numa1_cpumask",
+		.data		= &xcall_numa_cpumask_bits1,
+		.maxlen		= NR_CPUS,
+		.mode		= 0644,
+		.proc_handler	= proc_xcall_numa_cpumask,
+	},
+	{
+		.procname	= "xcall_numa2_cpumask",
+		.data		= &xcall_numa_cpumask_bits2,
+		.maxlen		= NR_CPUS,
+		.mode		= 0644,
+		.proc_handler	= proc_xcall_numa_cpumask,
+	},
+	{
+		.procname	= "xcall_numa3_cpumask",
+		.data		= &xcall_numa_cpumask_bits3,
+		.maxlen		= NR_CPUS,
+		.mode		= 0644,
+		.proc_handler	= proc_xcall_numa_cpumask,
+	},
+	{
+		.procname	= "max_xcall_cache_pages",
+		.data		= &max_fd_cache_pages,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE,
+	},
 #endif
 	{ }
 };
-- 
2.34.1