
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/release-management/issues/IC9Q31

--------------------------------

Support xcall async prefetch for fd wakeup in epoll_pwait.

In epoll_pwait, read data from a ready fd into a pre-allocated kernel
cache buffer. Then, in sys_read, read from the cache buffer and copy it
to user space. This way, read data can be prefetched asynchronously at
epoll_pwait time.

                        (lock state, cannot kernel_read())
+--------------CANCEL<------------------------------------+
|                ^                                         |
|                |                                         |
|  (prefetch not |              (kernel_read() done)       |  (cache buf
|   started)     |  (prefetching)                          |   not empty)
+--------------> NONE ---->PREFETCH ----> READY <----------+
      (vfs_read done or not needed)

Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
---
Note: a minimal userspace sketch of the epoll_pwait() + read() pattern
this series targets is appended after the patch.

 fs/eventpoll.c     | 277 +++++++++++++++++++++++++++++++++++++++++++++
 fs/file_table.c    |   1 +
 fs/read_write.c    |  10 +-
 include/linux/fs.h |  35 ++++++
 4 files changed, 322 insertions(+), 1 deletion(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5ce1ea1f452b..968db82175bf 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -768,6 +768,272 @@ static void epi_rcu_free(struct rcu_head *head)
 	kmem_cache_free(epi_cache, epi);
 }
 
+#ifdef CONFIG_XCALL_PREFETCH
+#include <linux/cpufeature.h>
+#include <asm/xcall.h>
+
+#define XCALL_CACHE_PAGE_ORDER	2
+#define XCALL_CACHE_BUF_SIZE	((1 << XCALL_CACHE_PAGE_ORDER) * PAGE_SIZE)
+#define PREFETCH_ITEM_HASH_BITS	6
+static DEFINE_HASHTABLE(xcall_item_table, PREFETCH_ITEM_HASH_BITS);
+static DEFINE_RWLOCK(xcall_table_lock);
+static struct workqueue_struct *rc_work;
+
+static inline bool transition_state(struct prefetch_item *pfi,
+				    enum cache_state old, enum cache_state new)
+{
+	return atomic_cmpxchg(&pfi->state, old, new) == old;
+}
+
+static void xcall_prefetch_init(void)
+{
+	if (!system_supports_xcall())
+		return;
+
+	rc_work = alloc_workqueue("eventpoll_rc", 0, 0);
+	if (!rc_work)
+		pr_warn("alloc eventpoll_rc workqueue failed.\n");
+
+	hash_init(xcall_item_table);
+}
+
+static struct prefetch_item *find_prefetch_item(struct file *file)
+{
+	unsigned int hash = hash_64((u64)file, PREFETCH_ITEM_HASH_BITS);
+	struct prefetch_item *found = NULL;
+
+	read_lock(&xcall_table_lock);
+	hash_for_each_possible(xcall_item_table, found, node, hash) {
+		if (found->file == file)
+			break;
+	}
+	read_unlock(&xcall_table_lock);
+
+	return found;
+}
+
+static void prefetch_work_fn(struct work_struct *work)
+{
+	struct prefetch_item *pfi = container_of(work, struct prefetch_item, work);
+
+	if (!transition_state(pfi, XCALL_CACHE_NONE, XCALL_CACHE_PREFETCH))
+		return;
+
+	pfi->pos = 0;
+	pfi->len = kernel_read(pfi->file, pfi->cache,
+			       XCALL_CACHE_BUF_SIZE, &pfi->file->f_pos);
+	transition_state(pfi, XCALL_CACHE_PREFETCH, XCALL_CACHE_READY);
+}
+
+static void set_prefetch_numa_cpu(struct prefetch_item *pfi, int fd)
+{
+	int cur_cpu = smp_processor_id();
+	int cpu;
+
+	cpumask_and(&pfi->related_cpus, cpu_cpu_mask(cur_cpu), cpu_online_mask);
+	cpu = cpumask_next(fd % cpumask_weight(&pfi->related_cpus),
+			   &pfi->related_cpus);
+	if (cpu > cpumask_last(&pfi->related_cpus))
+		cpu = cpumask_first(&pfi->related_cpus);
+	pfi->cpu = cpu;
+}
+
+static struct prefetch_item *alloc_prefetch_item(struct epitem *epi)
+{
+	struct file *tfile = epi->ffd.file;
+	struct prefetch_item *pfi;
+	int fd = epi->ffd.fd;
+	unsigned int hash;
+
+	pfi = kmalloc(sizeof(struct prefetch_item), GFP_KERNEL);
+	if (!pfi)
+		return NULL;
+
+	pfi->cache_pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+				       XCALL_CACHE_PAGE_ORDER);
+	if (!pfi->cache_pages) {
+		kfree(pfi);
+		return NULL;
+	}
+
+	pfi->cache = page_address(pfi->cache_pages);
+	atomic_set(&pfi->state, XCALL_CACHE_NONE);
+	INIT_WORK(&pfi->work, prefetch_work_fn);
+	INIT_HLIST_NODE(&pfi->node);
+	pfi->file = tfile;
+	pfi->len = 0;
+	pfi->pos = 0;
+	set_prefetch_numa_cpu(pfi, fd);
+
+	hash = hash_64((u64)tfile, PREFETCH_ITEM_HASH_BITS);
+	write_lock(&xcall_table_lock);
+	hash_add(xcall_item_table, &pfi->node, hash);
+	write_unlock(&xcall_table_lock);
+
+	return pfi;
+}
+
+void free_prefetch_item(struct file *file)
+{
+	struct prefetch_item *pfi = NULL;
+	struct hlist_node *next;
+	bool has_pfi = false;
+	unsigned int hash;
+
+	if (!system_supports_xcall())
+		return;
+
+	hash = hash_64((u64)file, PREFETCH_ITEM_HASH_BITS);
+	write_lock(&xcall_table_lock);
+	hash_for_each_possible_safe(xcall_item_table, pfi, next, node, hash) {
+		if (pfi->file == file) {
+			hlist_del_init(&pfi->node);
+			has_pfi = true;
+			break;
+		}
+	}
+	write_unlock(&xcall_table_lock);
+	if (!has_pfi)
+		return;
+
+	cancel_work_sync(&pfi->work);
+	__free_pages(pfi->cache_pages, XCALL_CACHE_PAGE_ORDER);
+	pfi->cache = NULL;
+	kfree(pfi);
+}
+
+static int xcall_read(struct prefetch_item *pfi, char __user *buf, size_t count)
+{
+	ssize_t copy_len = 0;
+
+	/*
+	 * Every time we memcpy from the prefetch buffer, the pfi state must
+	 * be "CANCEL" to avoid racing on the prefetch buffer between the
+	 * prefetch worker calling kernel_read() and other threads calling
+	 * copy_to_user(), and to avoid racing on the prefetched file between
+	 * the prefetch worker's kernel_read() and other threads' vfs_read().
+	 */
+	while (!transition_state(pfi, XCALL_CACHE_READY, XCALL_CACHE_CANCEL)) {
+		/*
+		 * If the prefetch worker returned an error or the prefetch
+		 * has not started, do not waste CPU waiting here; fall back
+		 * to a slow vfs_read() to pick up any newly arrived data.
+		 */
+		if (transition_state(pfi, XCALL_CACHE_NONE, XCALL_CACHE_CANCEL))
+			goto slow_read;
+	}
+
+	copy_len = pfi->len;
+	if (unlikely(copy_len < 0))
+		goto slow_read;
+
+	if (copy_len == 0) {
+		transition_state(pfi, XCALL_CACHE_CANCEL, XCALL_CACHE_NONE);
+		return 0;
+	}
+
+	copy_len = (copy_len >= count) ? count : copy_len;
+	copy_len -= copy_to_user(buf, (void *)(pfi->cache + pfi->pos), copy_len);
+	pfi->len -= copy_len;
+	pfi->pos += copy_len;
+	if (pfi->len == 0)
+		transition_state(pfi, XCALL_CACHE_CANCEL, XCALL_CACHE_NONE);
+	else
+		transition_state(pfi, XCALL_CACHE_CANCEL, XCALL_CACHE_READY);
+
+	return copy_len;
+
+slow_read:
+	pfi->len = 0;
+	pfi->pos = 0;
+	cancel_work(&pfi->work);
+
+	return -EAGAIN;
+}
+
+int xcall_read_begin(struct file *file, char __user *buf, size_t count)
+{
+	struct prefetch_item *pfi = NULL;
+
+	if (!current->xinfo)
+		return -EAGAIN;
+
+	pfi = find_prefetch_item(file);
+	if (!pfi)
+		return -EAGAIN;
+
+	return xcall_read(pfi, buf, count);
+}
+
+void xcall_read_end(struct file *file)
+{
+	struct prefetch_item *pfi = NULL;
+
+	if (!current->xinfo)
+		return;
+
+	pfi = find_prefetch_item(file);
+	if (!pfi)
+		return;
+
+	transition_state(pfi, XCALL_CACHE_CANCEL, XCALL_CACHE_NONE);
+}
+
+static int get_async_prefetch_cpu(struct prefetch_item *pfi)
+{
+	int cpu;
+
+	if (pfi->cpu != smp_processor_id())
+		return pfi->cpu;
+
+	cpu = cpumask_next(pfi->cpu, &pfi->related_cpus);
+	if (cpu > cpumask_last(&pfi->related_cpus))
+		cpu = cpumask_first(&pfi->related_cpus);
+	pfi->cpu = cpu;
+	return pfi->cpu;
+}
+
+static void ep_prefetch_item_enqueue(struct eventpoll *ep, struct epitem *epi)
+{
+	struct prefetch_item *pfi;
+	int cpu, err;
+
+	if (unlikely(!rc_work) || !current->xinfo || !current->xinfo->prefetch)
+		return;
+
+	if (!(epi->event.events & EPOLLIN) ||
+	    !sock_from_file(epi->ffd.file, &err) ||
+	    !(epi->ffd.file->f_mode & FMODE_READ))
+		return;
+
+	pfi = find_prefetch_item(epi->ffd.file);
+	if (unlikely(!pfi)) {
+		pfi = alloc_prefetch_item(epi);
+		if (unlikely(!pfi))
+			return;
+	}
+
+	if (atomic_read(&pfi->state) != XCALL_CACHE_NONE)
+		return;
+
+	cpu = get_async_prefetch_cpu(pfi);
+	queue_work_on(cpu, rc_work, &pfi->work);
+}
+
+static void xcall_cancel_work(struct file *file)
+{
+	struct prefetch_item *pfi;
+
+	if (!current->xinfo)
+		return;
+
+	pfi = find_prefetch_item(file);
+	if (pfi)
+		cancel_work_sync(&pfi->work);
+}
+#endif
+
 /*
  * Removes a "struct epitem" from the eventpoll RB tree and deallocates
  * all the associated resources. Must be called with "mtx" held.
@@ -782,6 +1048,9 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	 * Removes poll wait queue hooks.
 	 */
 	ep_unregister_pollwait(ep, epi);
+#ifdef CONFIG_XCALL_PREFETCH
+	xcall_cancel_work(file);
+#endif
 
 	/* Remove the current item from the list of epoll hooks */
 	spin_lock(&file->f_lock);
@@ -1751,6 +2020,10 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head
 		if (!revents)
 			continue;
 
+#ifdef CONFIG_XCALL_PREFETCH
+		ep_prefetch_item_enqueue(ep, epi);
+#endif
+
 		if (__put_user(revents, &uevent->events) ||
 		    __put_user(epi->event.data, &uevent->data)) {
 			list_add(&epi->rdllink, head);
@@ -2454,6 +2727,10 @@ static int __init eventpoll_init(void)
 	pwq_cache = kmem_cache_create("eventpoll_pwq",
 			sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
 
+#ifdef CONFIG_XCALL_PREFETCH
+	xcall_prefetch_init();
+#endif
+
 	return 0;
 }
 fs_initcall(eventpoll_init);
diff --git a/fs/file_table.c b/fs/file_table.c
index 542f4fddc0a0..935fdd668e05 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -301,6 +301,7 @@ static void __fput(struct file *file)
 	 * in the file cleanup chain.
 	 */
 	eventpoll_release(file);
+	free_prefetch_item(file);
 	locks_remove_file(file);
 	ima_file_free(file);
diff --git a/fs/read_write.c b/fs/read_write.c
index da03b3e65cf3..a850bb1b0a77 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -623,7 +623,13 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
-		loff_t pos, *ppos = file_ppos(f.file);
+		loff_t pos, *ppos;
+
+		ret = xcall_read_begin(f.file, buf, count);
+		if (ret != -EAGAIN)
+			goto fdput;
+
+		ppos = file_ppos(f.file);
 		if (ppos) {
 			pos = *ppos;
 			ppos = &pos;
@@ -631,6 +637,8 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 		ret = vfs_read(f.file, buf, count, ppos);
 		if (ret >= 0 && ppos)
 			f.file->f_pos = pos;
+		xcall_read_end(f.file);
+fdput:
 		fdput_pos(f);
 	}
 	return ret;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a0ea6b64c45d..221b4d4e3889 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3750,4 +3750,39 @@ static inline bool cachefiles_ondemand_is_enabled(void)
 }
 #endif
 
+#ifdef CONFIG_XCALL_PREFETCH
+enum cache_state {
+	XCALL_CACHE_NONE = 0,
+	XCALL_CACHE_PREFETCH,
+	XCALL_CACHE_READY,
+	XCALL_CACHE_CANCEL
+};
+
+struct prefetch_item {
+	struct file *file;
+	struct work_struct work;
+	int cpu;
+	cpumask_t related_cpus;
+	struct page *cache_pages;
+	char *cache;
+	ssize_t len;
+	/* cache state in epoll_wait */
+	atomic_t state;
+	loff_t pos;
+	struct hlist_node node;
+};
+
+int xcall_read_begin(struct file *file, char __user *buf, size_t count);
+void xcall_read_end(struct file *file);
+void free_prefetch_item(struct file *file);
+#else
+static inline int xcall_read_begin(struct file *file, char __user *buf,
+				   size_t count)
+{
+	return -EAGAIN;
+}
+static inline void xcall_read_end(struct file *file) {}
+static inline void free_prefetch_item(struct file *file) {}
+#endif
+
 #endif /* _LINUX_FS_H */
-- 
2.34.1
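
A minimal userspace sketch of the pattern this series targets, for
context only (not part of the patch): epoll_pwait() reports a readable
socket, and the following read() is the call that ksys_read() may now
serve from the per-file prefetch cache, assuming xcall prefetch is
enabled for the calling task. serve(), conn_fd and the 4 KiB buffer are
illustrative assumptions.

#include <sys/epoll.h>
#include <unistd.h>

#define MAX_EVENTS 64

static int serve(int conn_fd)
{
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = conn_fd };
	struct epoll_event events[MAX_EVENTS];
	char buf[4096];
	int epfd, n, i;

	epfd = epoll_create1(0);
	if (epfd < 0 || epoll_ctl(epfd, EPOLL_CTL_ADD, conn_fd, &ev) < 0)
		return -1;

	for (;;) {
		/* Readiness notification; the kernel may queue a prefetch here. */
		n = epoll_pwait(epfd, events, MAX_EVENTS, -1, NULL);
		if (n < 0)
			break;

		for (i = 0; i < n; i++) {
			/* This read() may be served from the prefetched kernel buffer. */
			ssize_t len = read(events[i].data.fd, buf, sizeof(buf));

			if (len <= 0)
				goto out;
			/* ... process len bytes ... */
		}
	}
out:
	close(epfd);
	return 0;
}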