
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/release-management/issues/IC9Q31 -------------------------------- Add cache mode for fd wakeup in epoll_pwait. In epoll_pwait, read data from the ready fd into a pre-allocated kernel cache buffer. Then, in sys_read, read from the cache buffer and copy the data to user space. This allows the read data to be prefetched asynchronously in epoll_pwait. (lock state can not kernel_read()) +--------------CANCEL<------------------------------+ | ^ | | | | | (prefetch not | (kernel_read() done) | (cache buf | start) | (prefetching) | not empty) +--------------> NONE ---->PREFETCH ----> READY <---+ (vfs_read done or not need) Signed-off-by: Yipeng Zou <zouyipeng@huawei.com> Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com> --- arch/Kconfig | 16 +++ fs/eventpoll.c | 258 +++++++++++++++++++++++++++++++++++++++++++++ fs/file_table.c | 1 + fs/read_write.c | 6 ++ include/linux/fs.h | 35 ++++++ 5 files changed, 316 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 6dc501a4afb1..26a3a3862cd4 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1205,6 +1205,22 @@ config FAST_SYSCALL exception handling path that only considers necessary features such as security, context saving, and recovery. +config XCALL_PREFETCH + bool "Xcall prefetch support" + depends on FAST_SYSCALL + default n + help + This enables the xcall prefetch feature. + The xcall prefetch feature implements customized epoll_wait() and + read() system calls, which enable data prefetching. + In high-concurrency connection scenarios, this improves + the parallel execution efficiency of the read() system call + and increases the system's business throughput. + The xcall prefetch feature is suitable for business scenarios + where the epoll I/O multiplexing mechanism is used, the read() + system call takes up a large proportion of time, and the number + of concurrent connections is large. 
+ config ARCH_SUPPORTS_FAST_IRQ bool diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 5ce1ea1f452b..dc0f11640eb0 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -768,6 +768,249 @@ static void epi_rcu_free(struct rcu_head *head) kmem_cache_free(epi_cache, epi); } +#ifdef CONFIG_XCALL_PREFETCH +#define PREFETCH_ITEM_HASH_BITS 6 +static DEFINE_HASHTABLE(xcall_item_table, PREFETCH_ITEM_HASH_BITS); +static DEFINE_RWLOCK(xcall_table_lock); +static struct workqueue_struct *rc_work; + +static inline bool transition_state(struct prefetch_item *pfi, + enum cache_state old, enum cache_state new) +{ + return atomic_cmpxchg(&pfi->state, old, new) == old; +} + +static struct prefetch_item *find_prefetch_item(struct file *file) +{ + unsigned int hash = hash_64((u64)file, PREFETCH_ITEM_HASH_BITS); + struct prefetch_item *found = NULL; + + read_lock(&xcall_table_lock); + hash_for_each_possible(xcall_item_table, found, node, hash) { + if (found->file == file) + break; + } + read_unlock(&xcall_table_lock); + + return found; +} + +static void prefetch_work_fn(struct work_struct *work) +{ + struct prefetch_item *pfi = container_of(work, struct prefetch_item, work); + + if (!transition_state(pfi, XCALL_CACHE_NONE, XCALL_CACHE_PREFETCH)) + return; + + pfi->len = kernel_read(pfi->file, pfi->cache, + PAGE_SIZE, &pfi->file->f_pos); + transition_state(pfi, XCALL_CACHE_PREFETCH, XCALL_CACHE_READY); +} + +static void set_prefetch_numa_cpu(struct prefetch_item *pfi, int fd) +{ + int cpu = smp_processor_id(); + + cpumask_and(&pfi->related_cpus, cpu_cpu_mask(cpu), cpu_online_mask); + pfi->cpu = cpumask_next(fd % cpumask_weight(&pfi->related_cpus), + &pfi->related_cpus); +} + +static struct prefetch_item *alloc_prefetch_item(struct epitem *epi) +{ + struct file *tfile = epi->ffd.file; + struct prefetch_item *pfi; + int fd = epi->ffd.fd; + unsigned int hash; + + pfi = kmalloc(sizeof(struct prefetch_item), GFP_KERNEL); + if (!pfi) + return NULL; + + pfi->cache_pages = 
alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 0); + if (!pfi->cache_pages) { + kfree(pfi); + return NULL; + } + + pfi->cache = page_address(pfi->cache_pages); + atomic_set(&pfi->state, XCALL_CACHE_NONE); + INIT_WORK(&pfi->work, prefetch_work_fn); + INIT_HLIST_NODE(&pfi->node); + pfi->file = tfile; + pfi->len = 0; + pfi->pos = 0; + set_prefetch_numa_cpu(pfi, fd); + + write_lock(&xcall_table_lock); + hash = hash_64((u64)tfile, PREFETCH_ITEM_HASH_BITS); + hash_add(xcall_item_table, &pfi->node, hash); + write_unlock(&xcall_table_lock); + + return pfi; +} + +extern bool fast_syscall_enabled(void); +void free_prefetch_item(struct file *file) +{ + struct prefetch_item *pfi; + + if (!fast_syscall_enabled()) + return; + + pfi = find_prefetch_item(file); + if (!pfi) + return; + + write_lock(&xcall_table_lock); + if (!hlist_unhashed(&pfi->node)) + hlist_del_init(&pfi->node); + write_unlock(&xcall_table_lock); + __free_pages(pfi->cache_pages, 0); + pfi->cache = NULL; + kfree(pfi); +} + +static int xcall_read(struct prefetch_item *pfi, char __user *buf, size_t count) +{ + ssize_t copy_ret = -1; + ssize_t copy_len = 0; + + while (!transition_state(pfi, XCALL_CACHE_READY, XCALL_CACHE_CANCEL)) { + if (transition_state(pfi, XCALL_CACHE_NONE, XCALL_CACHE_CANCEL)) + goto slow_read; + } + + copy_len = pfi->len; + if (unlikely(copy_len < 0)) + goto slow_read; + + if (copy_len == 0) { + transition_state(pfi, XCALL_CACHE_CANCEL, XCALL_CACHE_NONE); + return 0; + } + + copy_len = (copy_len >= count) ? 
count : copy_len; + copy_ret = copy_to_user(buf, (void *)(pfi->cache + pfi->pos), copy_len); + pfi->len -= (copy_len - copy_ret); + if (pfi->len == 0) { + pfi->len = 0; + pfi->pos = 0; + transition_state(pfi, XCALL_CACHE_CANCEL, XCALL_CACHE_NONE); + } else if (pfi->len > 0) { + pfi->pos += copy_len; + transition_state(pfi, XCALL_CACHE_CANCEL, XCALL_CACHE_READY); + } + + return copy_len - copy_ret; + +slow_read: + pfi->len = 0; + pfi->pos = 0; + cancel_work(&pfi->work); + + return -EAGAIN; +} + +int xcall_read_begin(struct file *file, char __user *buf, size_t count) +{ + struct prefetch_item *pfi = NULL; + + if (!current->xcall_select || + !test_bit(__NR_epoll_pwait, current->xcall_select)) + return -EAGAIN; + + if (!file) + return -EAGAIN; + + pfi = find_prefetch_item(file); + if (!pfi) + return -EAGAIN; + + return xcall_read(pfi, buf, count); +} + +void xcall_read_end(struct file *file) +{ + struct prefetch_item *pfi = NULL; + + if (!current->xcall_select || + !test_bit(__NR_epoll_pwait, current->xcall_select)) + return; + + if (!file) + return; + + pfi = find_prefetch_item(file); + if (!pfi) + return; + + transition_state(pfi, XCALL_CACHE_CANCEL, XCALL_CACHE_NONE); +} + +static int get_async_prefetch_cpu(struct prefetch_item *pfi) +{ + int cpu; + + if (pfi->cpu != smp_processor_id()) + return pfi->cpu; + + cpu = cpumask_next(pfi->cpu, &pfi->related_cpus); + if (cpu > cpumask_last(&pfi->related_cpus)) + cpu = cpumask_first(&pfi->related_cpus); + pfi->cpu = cpu; + return pfi->cpu; +} + +static void ep_prefetch_item_enqueue(struct eventpoll *ep, struct epitem *epi) +{ + struct prefetch_item *pfi; + int cpu, err; + + if (unlikely(!rc_work)) + return; + + if (!(epi->event.events & EPOLLIN)) + return; + + if (!sock_from_file(epi->ffd.file, &err)) + return; + + if (!(epi->ffd.file->f_mode & FMODE_READ)) + return; + + if (!current->xcall_select || + !test_bit(__NR_epoll_pwait, current->xcall_select)) + return; + + pfi = find_prefetch_item(epi->ffd.file); + if 
(unlikely(!pfi)) { + pfi = alloc_prefetch_item(epi); + if (unlikely(!pfi)) + return; + } + + if (atomic_read(&pfi->state) != XCALL_CACHE_NONE) + return; + + cpu = get_async_prefetch_cpu(pfi); + queue_work_on(cpu, rc_work, &pfi->work); +} + +static void xcall_cancel_work(struct file *file) +{ + struct prefetch_item *pfi; + + if (!current->xcall_select || + !test_bit(__NR_epoll_pwait, current->xcall_select)) + return; + + pfi = find_prefetch_item(file); + if (pfi) + cancel_work_sync(&pfi->work); +} +#endif + /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. @@ -782,6 +1025,9 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) * Removes poll wait queue hooks. */ ep_unregister_pollwait(ep, epi); +#ifdef CONFIG_XCALL_PREFETCH + xcall_cancel_work(file); +#endif /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); @@ -1751,6 +1997,10 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head if (!revents) continue; +#ifdef CONFIG_XCALL_PREFETCH + ep_prefetch_item_enqueue(ep, epi); +#endif + if (__put_user(revents, &uevent->events) || __put_user(epi->event.data, &uevent->data)) { list_add(&epi->rdllink, head); @@ -2454,6 +2704,14 @@ static int __init eventpoll_init(void) pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); +#ifdef CONFIG_XCALL_PREFETCH + rc_work = alloc_workqueue("eventpoll_rc", 0, 0); + if (!rc_work) + pr_warn("alloc eventpoll_rc workqueue failed.\n"); + + hash_init(xcall_item_table); +#endif + return 0; } fs_initcall(eventpoll_init); diff --git a/fs/file_table.c b/fs/file_table.c index 542f4fddc0a0..24008708a202 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -79,6 +79,7 @@ static void file_free_rcu(struct rcu_head *head) put_cred(f->f_cred); kmem_cache_free(filp_cachep, GET_FILE_WRAP(f)); + free_prefetch_item(f); } static inline void 
file_free(struct file *f) diff --git a/fs/read_write.c b/fs/read_write.c index da03b3e65cf3..ffe61733d174 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -622,6 +622,11 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; + ret = xcall_read_begin(f.file, buf, count); + if (ret != -EAGAIN) { + fdput_pos(f); + return ret; + } if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { @@ -632,6 +637,7 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); + xcall_read_end(f.file); } return ret; } diff --git a/include/linux/fs.h b/include/linux/fs.h index a0ea6b64c45d..221b4d4e3889 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3750,4 +3750,39 @@ static inline bool cachefiles_ondemand_is_enabled(void) } #endif +#ifdef CONFIG_XCALL_PREFETCH +enum cache_state { + XCALL_CACHE_NONE = 0, + XCALL_CACHE_PREFETCH, + XCALL_CACHE_READY, + XCALL_CACHE_CANCEL +}; + +struct prefetch_item { + struct file *file; + struct work_struct work; + int cpu; + cpumask_t related_cpus; + struct page *cache_pages; + char *cache; + ssize_t len; + /* cache state in epoll_wait */ + atomic_t state; + loff_t pos; + struct hlist_node node; +}; + +int xcall_read_begin(struct file *file, char __user *buf, size_t count); +void xcall_read_end(struct file *file); +void free_prefetch_item(struct file *file); +#else +static inline int xcall_read_begin(struct file *file, char __user *buf, + size_t count) +{ + return -EAGAIN; +} +static inline void xcall_read_end(struct file *file) {} +static inline void free_prefetch_item(struct file *file) {} +#endif + #endif /* _LINUX_FS_H */ -- 2.34.1