[PATCH openEuler-5.10 01/17] tcp: Add some stub info for KABI consistency

newer
[PATCH OLK-5.10 0/8] revert bcache...

older
[PATCH OLK-5.10] arm64: Add memmap...

Zheng Zengkai

10 Jan 2022 10 Jan '22

5:33 p.m.

From: Baisong Zhong <zhongbaisong@huawei.com> hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4PY1Q CVE: NA -------------------------------- We add stub info in some structures to maintain the consistency of KABI Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com> Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com> Reviewed-by: Yue Haibing <yuehaibing@huawei.com> Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com> --- include/linux/skbuff.h | 2 ++ include/net/ipv6.h | 2 +- include/net/sock.h | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9e3a454d2377..d485f17ff33a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -36,6 +36,7 @@ #include <linux/splice.h> #include <linux/in6.h> #include <linux/if_packet.h> +#include <linux/llist.h> #include <net/flow.h> #include <net/page_pool.h> #include <linux/kabi.h> @@ -732,6 +733,7 @@ struct sk_buff { }; struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ struct list_head list; + struct llist_node ll_node; }; union { diff --git a/include/net/ipv6.h b/include/net/ipv6.h index bd1f396cc9c7..c0273ae50296 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -344,9 +344,9 @@ struct ipcm6_cookie { struct sockcm_cookie sockc; __s16 hlimit; __s16 tclass; + __u16 gso_size; __s8 dontfrag; struct ipv6_txoptions *opt; - __u16 gso_size; }; static inline void ipcm6_init(struct ipcm6_cookie *ipc6) diff --git a/include/net/sock.h b/include/net/sock.h index 712bb7b09f96..b3d451878640 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -63,6 +63,7 @@ #include <linux/atomic.h> #include <linux/refcount.h> +#include <linux/llist.h> #include <net/dst.h> #include <net/checksum.h> #include <net/tcp_states.h> @@ -405,6 +406,8 @@ struct sock { struct sk_buff *head; struct sk_buff *tail; } sk_backlog; + struct llist_head defer_list; + #define sk_rmem_alloc sk_backlog.rmem_alloc int sk_forward_alloc; -- 2.20.1

Show replies by date

Zheng Zengkai

10 Jan 10 Jan

5:33 p.m.

New subject: [PATCH openEuler-5.10 02/17] bcache: add a framework to perform prefetch

From: Li Ruilin <liruilin4@huawei.com> euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4LOJ6 CVE: NA ------------------------------ Add a framwork to transform io informations to userspace client and process prefetch request sent by userspace client. Create a char device namede "acache" for connecting between kernelspace and userspace. Save informations of all io requests into a buffer and pass them to client when client reads from the device. The prefetch request could be treated as normal io request. As deference, those requests have no need return data back to userspace, and they should not append readahead part. Add two parameters. acache_dev_size is for controlling size of buffer to save io informations. acache_prefetch_workers is for controlling max threads to process prefetch requests. Signed-off-by: Li Ruilin <liruilin4@huawei.com> Reviewed-by: Luan Jianhai <luanjianhai@huawei.com> Reviewed-by: Peng Junyi <pengjunyi1@huawei.com> Acked-by: Xie Xiuqi <xiexiuqi@huawei.com> Signed-off-by: Cheng Jian <cj.chengjian@huawei.com> Reviewed-by: Guangxing Deng <dengguangxing@huawei.com> Reviewed-by: chao song <chao.song@huawei.com> Reviewed-by: chao song <chao.song@huawei.com> Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com> --- drivers/md/bcache/Makefile | 2 +- drivers/md/bcache/acache.c | 473 ++++++++++++++++++++++++++++++++++ drivers/md/bcache/acache.h | 69 +++++ drivers/md/bcache/btree.c | 4 +- drivers/md/bcache/request.c | 129 ++++++---- drivers/md/bcache/request.h | 32 +++ drivers/md/bcache/super.c | 4 + include/trace/events/bcache.h | 11 + 8 files changed, 670 insertions(+), 54 deletions(-) create mode 100644 drivers/md/bcache/acache.c create mode 100644 drivers/md/bcache/acache.h diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index 5b87e59676b8..28f5294dd6cf 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -3,5 +3,5 @@ obj-$(CONFIG_BCACHE) += bcache.o bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ - io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + io.o journal.o movinggc.o request.o stats.o acache.o super.o sysfs.o trace.o\ util.o writeback.o features.o diff --git a/drivers/md/bcache/acache.c b/drivers/md/bcache/acache.c new file mode 100644 index 000000000000..ff3e120d9619 --- /dev/null +++ b/drivers/md/bcache/acache.c @@ -0,0 +1,473 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "acache.h" +#include "request.h" + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/cdev.h> +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/circ_buf.h> +#include <linux/list.h> + +#include <trace/events/bcache.h> + +#define DEV_NAME "acache" + +int acache_dev_size = (1024 * 4096 + 4096); + +module_param_named(acache_size, acache_dev_size, int, 0444); +MODULE_PARM_DESC(acache_size, "size of ring buffer for size in byte"); + +int acache_prefetch_workers = 1000; + +module_param_named(prefetch_workers, acache_prefetch_workers, int, 0444); +MODULE_PARM_DESC(prefetch_workers, "num of workers for processing prefetch requests"); + +struct prefetch_worker { + struct acache_info s; + struct work_struct work; + struct list_head list; +}; + +struct acache_device { + bool initialized; + + dev_t devno; + struct cdev cdev; + struct class *class; + struct mem_reg *mem_regionp; + + struct acache_info *readbuf; + struct acache_info *writebuf; + + struct acache_circ *acache_info_circ; + + struct workqueue_struct *wq; + struct prefetch_worker *prefetch_workers; + struct list_head prefetch_workers_free; + spinlock_t prefetch_workers_free_list_lock; +} adev; + +#define MAX_TRANSFER_SIZE (1024 * 1024) + +static atomic_t acache_opened_dev = ATOMIC_INIT(0); +static struct acache_metadata metadata; + + +int acache_open(struct inode *inode, struct file *filp) +{ + struct mem_reg *dev; + + int minor = MINOR(inode->i_rdev); + + if (minor >= ACACHE_NR_DEVS) + return -ENODEV; + if (atomic_xchg(&acache_opened_dev, 1)) + return -EPERM; + + dev = &adev.mem_regionp[minor]; + + filp->private_data = dev; + + return 0; +} + +int acache_release(struct inode *inode, struct file *filp) +{ + atomic_dec(&acache_opened_dev); + return 0; +} + +ssize_t read_circ_slice(struct acache_circ *circ, struct acache_info *buf, + size_t size) +{ + unsigned long first, todo, flags; + + spin_lock_irqsave(&circ->lock, flags); + + todo = CIRC_CNT(circ->head, circ->tail, circ->size); + if (todo == 0) { + spin_unlock_irqrestore(&circ->lock, flags); + return 0; + } + if (todo > size / sizeof(struct acache_info)) + todo = size / sizeof(struct acache_info); + + first = CIRC_CNT_TO_END(circ->head, circ->tail, circ->size); + if (first > todo) + first = todo; + + memcpy(buf, circ->data + circ->tail, first * sizeof(struct acache_info)); + if (first < todo) + memcpy(buf + first, circ->data, + (todo - first) * sizeof(struct acache_info)); + circ->tail = (circ->tail + todo) & (circ->size - 1); + + spin_unlock_irqrestore(&circ->lock, flags); + return todo * sizeof(struct acache_info); +} + +static ssize_t acache_read(struct file *filp, char __user *buf, + size_t size, loff_t *ppos) +{ + long ret, cut; + + if (metadata.conntype != ACACHE_READWRITE_CONN) + return -EINVAL; + + if (size > MAX_TRANSFER_SIZE) + size = MAX_TRANSFER_SIZE; + + ret = read_circ_slice(adev.acache_info_circ, adev.readbuf, size); + if (ret <= 0) + return ret; + + cut = copy_to_user(buf, adev.readbuf, size); + return ret - cut; +} + +int process_one_request(struct acache_info *item); +static void prefetch_worker_func(struct work_struct *work) +{ + struct prefetch_worker *sw = + container_of(work, struct prefetch_worker, work); + + process_one_request(&sw->s); + spin_lock(&adev.prefetch_workers_free_list_lock); + list_add_tail(&sw->list, &adev.prefetch_workers_free); + spin_unlock(&adev.prefetch_workers_free_list_lock); +} + +static int queue_prefetch_item(struct acache_info *s) +{ + struct prefetch_worker *sw; + + spin_lock(&adev.prefetch_workers_free_list_lock); + sw = list_first_entry_or_null(&adev.prefetch_workers_free, + struct prefetch_worker, list); + if (!sw) { + spin_unlock(&adev.prefetch_workers_free_list_lock); + return -1; + } + list_del_init(&sw->list); + spin_unlock(&adev.prefetch_workers_free_list_lock); + + memcpy(&sw->s, s, sizeof(struct acache_info)); + INIT_WORK(&sw->work, prefetch_worker_func); + queue_work(adev.wq, &sw->work); + return 0; +} + +static ssize_t acache_write(struct file *filp, const char __user *buf, + size_t size, loff_t *ppos) +{ + long cut; + int i; + + if (metadata.conntype != ACACHE_READWRITE_CONN) + return -EINVAL; + + if (size > MAX_TRANSFER_SIZE) + size = MAX_TRANSFER_SIZE; + + cut = copy_from_user(adev.writebuf, buf, size); + for (i = 0; i < (size - cut) / sizeof(struct acache_info); i++) { + if (queue_prefetch_item(adev.writebuf + i)) + break; + } + return i * sizeof(struct acache_info); +} + +static long acache_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case ACACHE_GET_METADATA: + return copy_to_user((struct acache_metadata __user *)arg, + &metadata, sizeof(struct acache_metadata)); + default: + return -EINVAL; + } +} + +static const struct file_operations acache_fops = { + .owner = THIS_MODULE, + .read = acache_read, + .write = acache_write, + .open = acache_open, + .release = acache_release, + .unlocked_ioctl = acache_ioctl, +}; + +void save_circ_item(struct acache_info *data) +{ + unsigned long flags; + struct acache_circ *circ = adev.acache_info_circ; + + spin_lock_irqsave(&circ->lock, flags); + if (CIRC_SPACE(circ->head, circ->tail, circ->size) >= 1) { + memcpy(&circ->data[circ->head], data, sizeof(struct acache_info)); + circ->head = (circ->head + 1) & (circ->size - 1); + } else { + pr_debug("ringbuffer is full; discard new request.\n"); + } + spin_unlock_irqrestore(&circ->lock, flags); +} + +void init_acache_circ(struct acache_circ **circ, void *startaddr) +{ + *circ = (struct acache_circ *)startaddr; + (*circ)->head = 0; + (*circ)->tail = 0; + (*circ)->size = ACACHE_CIRC_SIZE; + spin_lock_init(&(*circ)->lock); +} + +static void acache_free_mem(void) +{ + int i; + + for (i = 0; i < ACACHE_NR_DEVS; i++) + vfree(adev.mem_regionp[i].data); + + if (adev.readbuf) { + vfree(adev.readbuf); + adev.readbuf = NULL; + } + if (adev.writebuf) { + vfree(adev.writebuf); + adev.writebuf = NULL; + } + + kfree(adev.prefetch_workers); + adev.prefetch_workers = NULL; +} + +int acache_prefetch_init(struct acache_device *adev) +{ + int i; + + if (acache_prefetch_workers <= 0) { + pr_err("acache_dev_size should not be less than zero\n"); + return -1; + } + adev->prefetch_workers = kmalloc_array(acache_prefetch_workers, + sizeof(struct prefetch_worker), + GFP_KERNEL); + if (!adev->prefetch_workers) + goto fail_prefetch_workers_alloc; + + INIT_LIST_HEAD(&adev->prefetch_workers_free); + spin_lock_init(&adev->prefetch_workers_free_list_lock); + for (i = 0; i < acache_prefetch_workers; i++) { + spin_lock(&adev->prefetch_workers_free_list_lock); + list_add_tail(&adev->prefetch_workers[i].list, + &adev->prefetch_workers_free); + spin_unlock(&adev->prefetch_workers_free_list_lock); + } + + adev->wq = alloc_workqueue("acache_prefetch", WQ_MEM_RECLAIM, 0); + if (!adev->wq) + goto fail_workqueue_alloc; + + return 0; + +fail_workqueue_alloc: + kfree(adev->prefetch_workers); + adev->prefetch_workers = NULL; +fail_prefetch_workers_alloc: + if (adev->wq) + destroy_workqueue(adev->wq); + return -1; +} + +int acache_dev_init(void) +{ + int ret; + int i; + int major; + struct device *dev; + + major = alloc_chrdev_region(&adev.devno, 0, ACACHE_NR_DEVS, DEV_NAME); + if (major < 0) { + pr_err("failed to allocate chrdev region: %d\n", major); + return major; + goto fail_allocdev; + } + + adev.class = class_create(THIS_MODULE, DEV_NAME); + if (IS_ERR(adev.class)) { + pr_err("failed to create acache class\n"); + ret = -1; + goto fail_class; + } + + if (acache_dev_size < PAGE_SIZE) { + pr_err("acache_dev_size should not be less than PAGE_SIZE\n"); + ret = -1; + goto fail_dev_add; + } + metadata.devsize = acache_dev_size; + metadata.magic = ACACHE_MAGIC; + metadata.conntype = ACACHE_READWRITE_CONN; + cdev_init(&adev.cdev, &acache_fops); + adev.cdev.owner = THIS_MODULE; + + ret = cdev_add(&adev.cdev, adev.devno, ACACHE_NR_DEVS); + if (ret < 0) { + pr_err("failed to add cdev\n"); + goto fail_dev_add; + } + + dev = device_create(adev.class, NULL, adev.devno, NULL, DEV_NAME); + if (IS_ERR(dev)) { + pr_err("Could not create device\n"); + ret = -1; + goto fail_device; + } + + adev.readbuf = vmalloc(MAX_TRANSFER_SIZE); + adev.writebuf = vmalloc(MAX_TRANSFER_SIZE); + if (!adev.readbuf || !adev.writebuf) { + ret = -ENOMEM; + goto fail_malloc; + } + + adev.initialized = true; + adev.mem_regionp = + kmalloc_array(ACACHE_NR_DEVS, sizeof(struct mem_reg), GFP_KERNEL); + if (!adev.mem_regionp) { + ret = -ENOMEM; + goto fail_malloc; + } + memset(adev.mem_regionp, 0, sizeof(struct mem_reg) * ACACHE_NR_DEVS); + + for (i = 0; i < ACACHE_NR_DEVS; i++) { + adev.mem_regionp[i].size = ACACHE_DEV_SIZE; + adev.mem_regionp[i].data = vmalloc(ACACHE_DEV_SIZE); + if (!adev.mem_regionp[i].data) { + ret = -ENOMEM; + goto fail_memregion_data_malloc; + } + memset(adev.mem_regionp[i].data, 0, ACACHE_DEV_SIZE); + } + + init_acache_circ(&adev.acache_info_circ, adev.mem_regionp[0].data); + if (acache_prefetch_init(&adev)) + goto fail_prefetch_init; + + return 0; + +fail_prefetch_init: +fail_memregion_data_malloc: + acache_free_mem(); +fail_malloc: + device_destroy(adev.class, adev.devno); +fail_device: + cdev_del(&adev.cdev); +fail_dev_add: + class_destroy(adev.class); +fail_class: + unregister_chrdev_region(adev.devno, ACACHE_NR_DEVS); +fail_allocdev: + return ret; +} + +void acache_dev_exit(void) +{ + if (!adev.initialized) + return; + + if (adev.wq) { + flush_workqueue(adev.wq); + destroy_workqueue(adev.wq); + } + device_destroy(adev.class, adev.devno); + cdev_del(&adev.cdev); + acache_free_mem(); + kfree(adev.mem_regionp); + unregister_chrdev_region(adev.devno, ACACHE_NR_DEVS); + class_destroy(adev.class); + kfree(adev.prefetch_workers); +} + +struct cached_dev *get_cached_device_by_dev(dev_t dev) +{ + struct cache_set *c, *tc; + struct cached_dev *dc, *t; + + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + list_for_each_entry_safe(dc, t, &c->cached_devs, list) + if (dc->bdev->bd_dev == dev && cached_dev_get(dc)) + return dc; + + return NULL; +} + +struct bio *get_bio_by_item(struct cached_dev *dc, struct acache_info *item) +{ + struct bio *bio; + uint64_t offset = item->offset + dc->sb.data_offset; + + if (get_capacity(dc->bdev->bd_disk) < offset + (item->length >> 9)) { + pr_err("prefetch area exceeds the capacity of disk(%d:%d), end: %llx, capacity: %llx\n", + MAJOR(dc->bdev->bd_dev), MINOR(dc->bdev->bd_dev), + offset + (item->length >> 9), + get_capacity(dc->bdev->bd_disk)); + return NULL; + } + + bio = bio_alloc_bioset(GFP_NOWAIT, DIV_ROUND_UP(item->length >> 9, PAGE_SECTORS), &dc->disk.bio_split); + if (!bio) { + bio = bio_alloc_bioset(GFP_NOWAIT, DIV_ROUND_UP(item->length >> 9, PAGE_SECTORS), NULL); + if (!bio) + return NULL; + } + + bio_set_dev(bio, dc->bdev); + bio->bi_iter.bi_sector = item->offset + dc->sb.data_offset; + bio->bi_iter.bi_size = (item->length >> 9) << 9; + + bch_bio_map(bio, NULL); + if (bch_bio_alloc_pages(bio, __GFP_NOWARN | GFP_NOIO)) + goto out_put; + + return bio; +out_put: + bio_put(bio); + return NULL; +} + +int process_one_request(struct acache_info *item) +{ + struct cached_dev *dc; + struct bio *cache_bio; + struct search *s; + + dc = get_cached_device_by_dev(item->dev); + if (dc == NULL) + return -1; + cache_bio = get_bio_by_item(dc, item); + if (cache_bio == NULL) { + pr_err("acache: failed to alloc bio for prefetch\n"); + goto put_dev; + } + + s = search_alloc(cache_bio, &dc->disk, true); + + trace_bcache_prefetch_request(&dc->disk, cache_bio); + cached_dev_read(dc, s); + return 0; + +put_dev: + cached_dev_put(dc); + return -1; +} + diff --git a/drivers/md/bcache/acache.h b/drivers/md/bcache/acache.h new file mode 100644 index 000000000000..dea6e8cb0a05 --- /dev/null +++ b/drivers/md/bcache/acache.h @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef _ACHACHE_INTERFACE_H_ +#define _ACHACHE_INTERFACE_H_ + +#define ACACHE_NR_DEVS 1 + +#define RING_SIZE + +#include "bcache.h" + +struct mem_reg { + char *data; + unsigned long size; +}; + +struct acache_info { + uint64_t length; + uint64_t offset; + uint64_t start_time; + dev_t dev; + int type; +}; + +enum acache_info_type { + ACACHE_INFO_READ = 0, + ACACHE_INFO_WRITE, + ACACHE_INFO_CACHE_INSERT, + ACACHE_INFO_LATENCY, +}; + +struct acache_circ { + spinlock_t lock; + int tail; + int head; + int size; + int item_size; + struct acache_info data[0]; +}; + +struct acache_metadata { + uint32_t magic; + uint32_t conntype; + uint32_t devsize; +}; + +#define ACACHE_DEV_SIZE acache_dev_size +#define ACACHE_MAGIC 2 + +enum acache_conn_types { + ACACHE_NO_CONN = 0, + ACACHE_READWRITE_CONN = 2, +}; + +#define ACACHE_CIRC_SIZE \ + ({int i = (ACACHE_DEV_SIZE - sizeof(struct acache_circ))/sizeof(struct acache_info); \ + int bits = 0; \ + while (i > 0) {i >>= 1; bits++; } \ + 1 << (bits - 1); }) + + +#define ACACHE_GET_METADATA _IOR('a', 1, struct acache_metadata) + +int acache_dev_init(void); +void acache_dev_exit(void); +struct acache_info *fetch_circ_item(struct acache_circ *circ); +void save_circ_item(struct acache_info *data); + +#endif diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index fe6dce125aba..96951e638f17 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2122,12 +2122,12 @@ static bool btree_insert_key(struct btree *b, struct bkey *k, BUG_ON(bkey_cmp(k, &b->key) > 0); status = bch_btree_insert_key(&b->keys, k, replace_key); + trace_bcache_btree_insert_key(b, k, replace_key != NULL, + status); if (status != BTREE_INSERT_STATUS_NO_INSERT) { bch_check_keys(&b->keys, "%u for %s", status, replace_key ? "replace" : "insert"); - trace_bcache_btree_insert_key(b, k, replace_key != NULL, - status); return true; } else return false; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 214326383145..5a64afd56b97 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -8,11 +8,13 @@ */ #include "bcache.h" +#include "acache.h" #include "btree.h" #include "debug.h" #include "request.h" #include "writeback.h" +#include <linux/time.h> #include <linux/module.h> #include <linux/hash.h> #include <linux/random.h> @@ -308,10 +310,18 @@ static void bch_data_insert_start(struct closure *cl) void bch_data_insert(struct closure *cl) { struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + struct acache_info msg; trace_bcache_write(op->c, op->inode, op->bio, op->writeback, op->bypass); + msg.offset = op->bio->bi_iter.bi_sector; + msg.length = op->bio->bi_iter.bi_size; + msg.type = ACACHE_INFO_CACHE_INSERT; + msg.dev = bio_dev(op->bio); + msg.start_time = ktime_get_ns(); + save_circ_item(&msg); + bch_keylist_init(&op->insert_keys); bio_get(op->bio); bch_data_insert_start(cl); @@ -460,28 +470,6 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) /* Cache lookup */ -struct search { - /* Stack frame for bio_complete */ - struct closure cl; - - struct bbio bio; - struct bio *orig_bio; - struct bio *cache_miss; - struct bcache_device *d; - - unsigned int insert_bio_sectors; - unsigned int recoverable:1; - unsigned int write:1; - unsigned int read_dirty_data:1; - unsigned int cache_missed:1; - - struct hd_struct *part; - unsigned long start_time; - - struct btree_op op; - struct data_insert_op iop; -}; - static void bch_cache_read_endio(struct bio *bio) { struct bbio *b = container_of(bio, struct bbio, bio); @@ -540,6 +528,7 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) return MAP_CONTINUE; /* XXX: figure out best pointer - for multiple cache devices */ + ptr = 0; PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; @@ -557,21 +546,25 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key); bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key); - n->bi_end_io = bch_cache_read_endio; - n->bi_private = &s->cl; + if (!s->prefetch) { + n->bi_end_io = bch_cache_read_endio; + n->bi_private = &s->cl; - /* - * The bucket we're reading from might be reused while our bio - * is in flight, and we could then end up reading the wrong - * data. - * - * We guard against this by checking (in cache_read_endio()) if - * the pointer is stale again; if so, we treat it as an error - * and reread from the backing device (but we don't pass that - * error up anywhere). - */ + /* + * The bucket we're reading from might be reused while our bio + * is in flight, and we could then end up reading the wrong + * data. + * + * We guard against this by checking (in cache_read_endio()) if + * the pointer is stale again; if so, we treat it as an error + * and reread from the backing device (but we don't pass that + * error up anywhere). + */ - __bch_submit_bbio(n, b->c); + __bch_submit_bbio(n, b->c); + } else { + bio_put(n); + } return n == bio ? MAP_DONE : MAP_CONTINUE; } @@ -674,7 +667,12 @@ static void bio_complete(struct search *s) trace_bcache_request_end(s->d, s->orig_bio); s->orig_bio->bi_status = s->iop.status; - bio_endio(s->orig_bio); + if (s->prefetch) { + bio_free_pages(s->orig_bio); + bio_put(s->orig_bio); + } else { + bio_endio(s->orig_bio); + } s->orig_bio = NULL; } } @@ -699,7 +697,7 @@ static void do_bio_hook(struct search *s, bio_cnt_set(bio, 3); } -static void search_free(struct closure *cl) +void search_free(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); @@ -713,8 +711,8 @@ static void search_free(struct closure *cl) mempool_free(s, &s->iop.c->search); } -static inline struct search *search_alloc(struct bio *bio, - struct bcache_device *d) +struct search *search_alloc(struct bio *bio, + struct bcache_device *d, bool prefetch) { struct search *s; @@ -733,6 +731,7 @@ static inline struct search *search_alloc(struct bio *bio, s->read_dirty_data = 0; /* Count on the bcache device */ s->start_time = part_start_io_acct(d->disk, &s->part, bio); + s->prefetch = prefetch; s->iop.c = d->c; s->iop.bio = NULL; s->iop.inode = d->id; @@ -836,17 +835,22 @@ static void cached_dev_read_done(struct closure *cl) s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9; bch_bio_map(s->iop.bio, NULL); - bio_copy_data(s->cache_miss, s->iop.bio); + if (!s->prefetch) + bio_copy_data(s->cache_miss, s->iop.bio); + else + trace_bcache_prefetch_cache_miss(s->iop.bio); bio_put(s->cache_miss); s->cache_miss = NULL; + } if (verify(dc) && s->recoverable && !s->read_dirty_data) bch_data_verify(dc, s->orig_bio); closure_get(&dc->disk.cl); - bio_complete(s); + if (!s->prefetch) + bio_complete(s); if (s->iop.bio && !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) { @@ -862,10 +866,19 @@ static void cached_dev_read_done_bh(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - bch_mark_cache_accounting(s->iop.c, s->d, + if (s->prefetch) + pr_debug("prefetch request; do not count cache_missed"); + else + bch_mark_cache_accounting(s->iop.c, s->d, !s->cache_missed, s->iop.bypass); trace_bcache_read(s->orig_bio, !s->cache_missed, s->iop.bypass); + if (!s->prefetch && !s->iop.status) { + s->smp.type = ACACHE_INFO_LATENCY; + s->smp.start_time = ktime_get_ns() - s->smp.start_time; + save_circ_item(&s->smp); + } + if (s->iop.status) continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); else if (s->iop.bio || verify(dc)) @@ -891,8 +904,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, } if (!(bio->bi_opf & REQ_RAHEAD) && - !(bio->bi_opf & (REQ_META|REQ_PRIO)) && - s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) + !(bio->bi_opf & (REQ_META|REQ_PRIO) ) && + s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA && + !s->prefetch) reada = min_t(sector_t, dc->readahead >> 9, get_capacity(bio->bi_disk) - bio_end_sector(bio)); @@ -943,14 +957,18 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, out_put: bio_put(cache_bio); out_submit: - miss->bi_end_io = backing_request_endio; - miss->bi_private = &s->cl; - /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, miss, &s->cl); + if (!s->prefetch) { + miss->bi_end_io = backing_request_endio; + miss->bi_private = &s->cl; + /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, miss, &s->cl); + } else { + bio_put(miss); + } return ret; } -static void cached_dev_read(struct cached_dev *dc, struct search *s) +void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; @@ -1197,7 +1215,7 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio) bio->bi_iter.bi_sector += dc->sb.data_offset; if (cached_dev_get(dc)) { - s = search_alloc(bio, d); + s = search_alloc(bio, d, false); trace_bcache_request_start(s->d, bio); if (!bio->bi_iter.bi_size) { @@ -1211,6 +1229,15 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio) } else { s->iop.bypass = check_should_bypass(dc, bio); + if (!s->iop.bypass && bio->bi_iter.bi_size && !rw) { + s->smp.offset = bio->bi_iter.bi_sector - dc->sb.data_offset; + s->smp.length = bio->bi_iter.bi_size; + s->smp.type = rw; + s->smp.dev = dc->bdev->bd_dev; + s->smp.start_time = ktime_get_ns(); + save_circ_item(&s->smp); + } + if (rw) cached_dev_write(dc, s); else @@ -1281,7 +1308,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio) return BLK_QC_T_NONE; } - s = search_alloc(bio, d); + s = search_alloc(bio, d, false); cl = &s->cl; bio = &s->bio.bio; diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 82b38366a95d..21678037d215 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_REQUEST_H_ #define _BCACHE_REQUEST_H_ +#include "btree.h" +#include "acache.h" struct data_insert_op { struct closure cl; @@ -44,4 +46,34 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio); extern struct kmem_cache *bch_search_cache; +struct search { + /* Stack frame for bio_complete */ + struct closure cl; + + struct bbio bio; + struct bio *orig_bio; + struct bio *cache_miss; + struct bcache_device *d; + + unsigned int insert_bio_sectors; + unsigned int recoverable:1; + unsigned int write:1; + unsigned int read_dirty_data:1; + unsigned int cache_missed:1; + + struct hd_struct *part; + unsigned long start_time; + /* for prefetch, we do not need copy data to bio */ + bool prefetch; + struct list_head list_node; + wait_queue_head_t wqh; + struct acache_info smp; + + struct btree_op op; + struct data_insert_op iop; +}; + +void search_free(struct closure *cl); +struct search *search_alloc(struct bio *bio, struct bcache_device *d, bool prefetch); +void cached_dev_read(struct cached_dev *dc, struct search *s); #endif /* _BCACHE_REQUEST_H_ */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 81f1cc5b3499..a849fc51cd88 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -8,6 +8,7 @@ */ #include "bcache.h" +#include "acache.h" #include "btree.h" #include "debug.h" #include "extents.h" @@ -2846,6 +2847,7 @@ static void bcache_exit(void) if (bcache_major) unregister_blkdev(bcache_major, "bcache"); + acache_dev_exit(); unregister_reboot_notifier(&reboot); mutex_destroy(&bch_register_lock); } @@ -2932,6 +2934,8 @@ static int __init bcache_init(void) bch_debug_init(); closure_debug_init(); + if (acache_dev_init()) + goto err; bcache_is_reboot = false; diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index e41c611d6d3b..f7be8c6e7cff 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -75,6 +75,12 @@ DECLARE_EVENT_CLASS(btree_node, TP_printk("bucket %zu", __entry->bucket) ); +/* acache.c */ +DEFINE_EVENT(bcache_request, bcache_prefetch_request, + TP_PROTO(struct bcache_device *d, struct bio *bio), + TP_ARGS(d, bio) +); + /* request.c */ DEFINE_EVENT(bcache_request, bcache_request_start, @@ -120,6 +126,11 @@ DEFINE_EVENT(bcache_bio, bcache_bypass_congested, TP_ARGS(bio) ); +DEFINE_EVENT(bcache_bio, bcache_prefetch_cache_miss, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + TRACE_EVENT(bcache_read, TP_PROTO(struct bio *bio, bool hit, bool bypass), TP_ARGS(bio, hit, bypass), -- 2.20.1

Xie XiuQi

11 Jan 11 Jan

10:55 a.m.

New subject: [PATCH openEuler-5.10 02/17] bcache: add a framework to perform prefetch

Hi zengkai, jianhai，ruilin, As discussed earlier, this series about bcache has a major conflict with mainline backporting, so revert it now. You can adapt them later. A better suggestion, push them to upstream community. -- Thanks, Xie XiuQi On 2022/1/10 17:33, Zheng Zengkai wrote:

...

From: Li Ruilin <liruilin4@huawei.com>

euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4LOJ6 CVE: NA

------------------------------

Add a framwork to transform io informations to userspace client and process prefetch request sent by userspace client. Create a char device namede "acache" for connecting between kernelspace and userspace. Save informations of all io requests into a buffer and pass them to client when client reads from the device.

The prefetch request could be treated as normal io request. As deference, those requests have no need return data back to userspace, and they should not append readahead part.

Add two parameters. acache_dev_size is for controlling size of buffer to save io informations. acache_prefetch_workers is for controlling max threads to process prefetch requests.

Signed-off-by: Li Ruilin <liruilin4@huawei.com> Reviewed-by: Luan Jianhai <luanjianhai@huawei.com> Reviewed-by: Peng Junyi <pengjunyi1@huawei.com> Acked-by: Xie Xiuqi <xiexiuqi@huawei.com> Signed-off-by: Cheng Jian <cj.chengjian@huawei.com> Reviewed-by: Guangxing Deng <dengguangxing@huawei.com> Reviewed-by: chao song <chao.song@huawei.com> Reviewed-by: chao song <chao.song@huawei.com> Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com> --- drivers/md/bcache/Makefile | 2 +- drivers/md/bcache/acache.c | 473 ++++++++++++++++++++++++++++++++++ drivers/md/bcache/acache.h | 69 +++++ drivers/md/bcache/btree.c | 4 +- drivers/md/bcache/request.c | 129 ++++++---- drivers/md/bcache/request.h | 32 +++ drivers/md/bcache/super.c | 4 + include/trace/events/bcache.h | 11 + 8 files changed, 670 insertions(+), 54 deletions(-) create mode 100644 drivers/md/bcache/acache.c create mode 100644 drivers/md/bcache/acache.h

diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index 5b87e59676b8..28f5294dd6cf 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -3,5 +3,5 @@ obj-$(CONFIG_BCACHE) += bcache.o

bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ - io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + io.o journal.o movinggc.o request.o stats.o acache.o super.o sysfs.o trace.o\ util.o writeback.o features.o diff --git a/drivers/md/bcache/acache.c b/drivers/md/bcache/acache.c new file mode 100644 index 000000000000..ff3e120d9619 --- /dev/null +++ b/drivers/md/bcache/acache.c @@ -0,0 +1,473 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "acache.h" +#include "request.h" + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/cdev.h> +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/circ_buf.h> +#include <linux/list.h> + +#include <trace/events/bcache.h> + +#define DEV_NAME "acache" + +int acache_dev_size = (1024 * 4096 + 4096); + +module_param_named(acache_size, acache_dev_size, int, 0444); +MODULE_PARM_DESC(acache_size, "size of ring buffer for size in byte"); + +int acache_prefetch_workers = 1000; + +module_param_named(prefetch_workers, acache_prefetch_workers, int, 0444); +MODULE_PARM_DESC(prefetch_workers, "num of workers for processing prefetch requests"); + +struct prefetch_worker { + struct acache_info s; + struct work_struct work; + struct list_head list; +}; + +struct acache_device { + bool initialized; + + dev_t devno; + struct cdev cdev; + struct class *class; + struct mem_reg *mem_regionp; + + struct acache_info *readbuf; + struct acache_info *writebuf; + + struct acache_circ *acache_info_circ; + + struct workqueue_struct *wq; + struct prefetch_worker *prefetch_workers; + struct list_head prefetch_workers_free; + spinlock_t prefetch_workers_free_list_lock; +} adev; + +#define MAX_TRANSFER_SIZE (1024 * 1024) + +static atomic_t acache_opened_dev = ATOMIC_INIT(0); +static struct acache_metadata metadata; + + +int acache_open(struct inode *inode, struct file *filp) +{ + struct mem_reg *dev; + + int minor = MINOR(inode->i_rdev); + + if (minor >= ACACHE_NR_DEVS) + return -ENODEV; + if (atomic_xchg(&acache_opened_dev, 1)) + return -EPERM; + + dev = &adev.mem_regionp[minor]; + + filp->private_data = dev; + + return 0; +} + +int acache_release(struct inode *inode, struct file *filp) +{ + atomic_dec(&acache_opened_dev); + return 0; +} + +ssize_t read_circ_slice(struct acache_circ *circ, struct acache_info *buf, + size_t size) +{ + unsigned long first, todo, flags; + + spin_lock_irqsave(&circ->lock, flags); + + todo = CIRC_CNT(circ->head, circ->tail, circ->size); + if (todo == 0) { + spin_unlock_irqrestore(&circ->lock, flags); + return 0; + } + if (todo > size / sizeof(struct acache_info)) + todo = size / sizeof(struct acache_info); + + first = CIRC_CNT_TO_END(circ->head, circ->tail, circ->size); + if (first > todo) + first = todo; + + memcpy(buf, circ->data + circ->tail, first * sizeof(struct acache_info)); + if (first < todo) + memcpy(buf + first, circ->data, + (todo - first) * sizeof(struct acache_info)); + circ->tail = (circ->tail + todo) & (circ->size - 1); + + spin_unlock_irqrestore(&circ->lock, flags); + return todo * sizeof(struct acache_info); +} + +static ssize_t acache_read(struct file *filp, char __user *buf, + size_t size, loff_t *ppos) +{ + long ret, cut; + + if (metadata.conntype != ACACHE_READWRITE_CONN) + return -EINVAL; + + if (size > MAX_TRANSFER_SIZE) + size = MAX_TRANSFER_SIZE; + + ret = read_circ_slice(adev.acache_info_circ, adev.readbuf, size); + if (ret <= 0) + return ret; + + cut = copy_to_user(buf, adev.readbuf, size); + return ret - cut; +} + +int process_one_request(struct acache_info *item); +static void prefetch_worker_func(struct work_struct *work) +{ + struct prefetch_worker *sw = + container_of(work, struct prefetch_worker, work); + + process_one_request(&sw->s); + spin_lock(&adev.prefetch_workers_free_list_lock); + list_add_tail(&sw->list, &adev.prefetch_workers_free); + spin_unlock(&adev.prefetch_workers_free_list_lock); +} + +static int queue_prefetch_item(struct acache_info *s) +{ + struct prefetch_worker *sw; + + spin_lock(&adev.prefetch_workers_free_list_lock); + sw = list_first_entry_or_null(&adev.prefetch_workers_free, + struct prefetch_worker, list); + if (!sw) { + spin_unlock(&adev.prefetch_workers_free_list_lock); + return -1; + } + list_del_init(&sw->list); + spin_unlock(&adev.prefetch_workers_free_list_lock); + + memcpy(&sw->s, s, sizeof(struct acache_info)); + INIT_WORK(&sw->work, prefetch_worker_func); + queue_work(adev.wq, &sw->work); + return 0; +} + +static ssize_t acache_write(struct file *filp, const char __user *buf, + size_t size, loff_t *ppos) +{ + long cut; + int i; + + if (metadata.conntype != ACACHE_READWRITE_CONN) + return -EINVAL; + + if (size > MAX_TRANSFER_SIZE) + size = MAX_TRANSFER_SIZE; + + cut = copy_from_user(adev.writebuf, buf, size); + for (i = 0; i < (size - cut) / sizeof(struct acache_info); i++) { + if (queue_prefetch_item(adev.writebuf + i)) + break; + } + return i * sizeof(struct acache_info); +} + +static long acache_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case ACACHE_GET_METADATA: + return copy_to_user((struct acache_metadata __user *)arg, + &metadata, sizeof(struct acache_metadata)); + default: + return -EINVAL; + } +} + +static const struct file_operations acache_fops = { + .owner = THIS_MODULE, + .read = acache_read, + .write = acache_write, + .open = acache_open, + .release = acache_release, + .unlocked_ioctl = acache_ioctl, +}; + +void save_circ_item(struct acache_info *data) +{ + unsigned long flags; + struct acache_circ *circ = adev.acache_info_circ; + + spin_lock_irqsave(&circ->lock, flags); + if (CIRC_SPACE(circ->head, circ->tail, circ->size) >= 1) { + memcpy(&circ->data[circ->head], data, sizeof(struct acache_info)); + circ->head = (circ->head + 1) & (circ->size - 1); + } else { + pr_debug("ringbuffer is full; discard new request.\n"); + } + spin_unlock_irqrestore(&circ->lock, flags); +} + +void init_acache_circ(struct acache_circ **circ, void *startaddr) +{ + *circ = (struct acache_circ *)startaddr; + (*circ)->head = 0; + (*circ)->tail = 0; + (*circ)->size = ACACHE_CIRC_SIZE; + spin_lock_init(&(*circ)->lock); +} + +static void acache_free_mem(void) +{ + int i; + + for (i = 0; i < ACACHE_NR_DEVS; i++) + vfree(adev.mem_regionp[i].data); + + if (adev.readbuf) { + vfree(adev.readbuf); + adev.readbuf = NULL; + } + if (adev.writebuf) { + vfree(adev.writebuf); + adev.writebuf = NULL; + } + + kfree(adev.prefetch_workers); + adev.prefetch_workers = NULL; +} + +int acache_prefetch_init(struct acache_device *adev) +{ + int i; + + if (acache_prefetch_workers <= 0) { + pr_err("acache_dev_size should not be less than zero\n"); + return -1; + } + adev->prefetch_workers = kmalloc_array(acache_prefetch_workers, + sizeof(struct prefetch_worker), + GFP_KERNEL); + if (!adev->prefetch_workers) + goto fail_prefetch_workers_alloc; + + INIT_LIST_HEAD(&adev->prefetch_workers_free); + spin_lock_init(&adev->prefetch_workers_free_list_lock); + for (i = 0; i < acache_prefetch_workers; i++) { + spin_lock(&adev->prefetch_workers_free_list_lock); + list_add_tail(&adev->prefetch_workers[i].list, + &adev->prefetch_workers_free); + spin_unlock(&adev->prefetch_workers_free_list_lock); + } + + adev->wq = alloc_workqueue("acache_prefetch", WQ_MEM_RECLAIM, 0); + if (!adev->wq) + goto fail_workqueue_alloc; + + return 0; + +fail_workqueue_alloc: + kfree(adev->prefetch_workers); + adev->prefetch_workers = NULL; +fail_prefetch_workers_alloc: + if (adev->wq) + destroy_workqueue(adev->wq); + return -1; +} + +int acache_dev_init(void) +{ + int ret; + int i; + int major; + struct device *dev; + + major = alloc_chrdev_region(&adev.devno, 0, ACACHE_NR_DEVS, DEV_NAME); + if (major < 0) { + pr_err("failed to allocate chrdev region: %d\n", major); + return major; + goto fail_allocdev; + } + + adev.class = class_create(THIS_MODULE, DEV_NAME); + if (IS_ERR(adev.class)) { + pr_err("failed to create acache class\n"); + ret = -1; + goto fail_class; + } + + if (acache_dev_size < PAGE_SIZE) { + pr_err("acache_dev_size should not be less than PAGE_SIZE\n"); + ret = -1; + goto fail_dev_add; + } + metadata.devsize = acache_dev_size; + metadata.magic = ACACHE_MAGIC; + metadata.conntype = ACACHE_READWRITE_CONN; + cdev_init(&adev.cdev, &acache_fops); + adev.cdev.owner = THIS_MODULE; + + ret = cdev_add(&adev.cdev, adev.devno, ACACHE_NR_DEVS); + if (ret < 0) { + pr_err("failed to add cdev\n"); + goto fail_dev_add; + } + + dev = device_create(adev.class, NULL, adev.devno, NULL, DEV_NAME); + if (IS_ERR(dev)) { + pr_err("Could not create device\n"); + ret = -1; + goto fail_device; + } + + adev.readbuf = vmalloc(MAX_TRANSFER_SIZE); + adev.writebuf = vmalloc(MAX_TRANSFER_SIZE); + if (!adev.readbuf || !adev.writebuf) { + ret = -ENOMEM; + goto fail_malloc; + } + + adev.initialized = true; + adev.mem_regionp = + kmalloc_array(ACACHE_NR_DEVS, sizeof(struct mem_reg), GFP_KERNEL); + if (!adev.mem_regionp) { + ret = -ENOMEM; + goto fail_malloc; + } + memset(adev.mem_regionp, 0, sizeof(struct mem_reg) * ACACHE_NR_DEVS); + + for (i = 0; i < ACACHE_NR_DEVS; i++) { + adev.mem_regionp[i].size = ACACHE_DEV_SIZE; + adev.mem_regionp[i].data = vmalloc(ACACHE_DEV_SIZE); + if (!adev.mem_regionp[i].data) { + ret = -ENOMEM; + goto fail_memregion_data_malloc; + } + memset(adev.mem_regionp[i].data, 0, ACACHE_DEV_SIZE); + } + + init_acache_circ(&adev.acache_info_circ, adev.mem_regionp[0].data); + if (acache_prefetch_init(&adev)) + goto fail_prefetch_init; + + return 0; + +fail_prefetch_init: +fail_memregion_data_malloc: + acache_free_mem(); +fail_malloc: + device_destroy(adev.class, adev.devno); +fail_device: + cdev_del(&adev.cdev); +fail_dev_add: + class_destroy(adev.class); +fail_class: + unregister_chrdev_region(adev.devno, ACACHE_NR_DEVS); +fail_allocdev: + return ret; +} + +void acache_dev_exit(void) +{ + if (!adev.initialized) + return; + + if (adev.wq) { + flush_workqueue(adev.wq); + destroy_workqueue(adev.wq); + } + device_destroy(adev.class, adev.devno); + cdev_del(&adev.cdev); + acache_free_mem(); + kfree(adev.mem_regionp); + unregister_chrdev_region(adev.devno, ACACHE_NR_DEVS); + class_destroy(adev.class); + kfree(adev.prefetch_workers); +} + +struct cached_dev *get_cached_device_by_dev(dev_t dev) +{ + struct cache_set *c, *tc; + struct cached_dev *dc, *t; + + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + list_for_each_entry_safe(dc, t, &c->cached_devs, list) + if (dc->bdev->bd_dev == dev && cached_dev_get(dc)) + return dc; + + return NULL; +} + +struct bio *get_bio_by_item(struct cached_dev *dc, struct acache_info *item) +{ + struct bio *bio; + uint64_t offset = item->offset + dc->sb.data_offset; + + if (get_capacity(dc->bdev->bd_disk) < offset + (item->length >> 9)) { + pr_err("prefetch area exceeds the capacity of disk(%d:%d), end: %llx, capacity: %llx\n", + MAJOR(dc->bdev->bd_dev), MINOR(dc->bdev->bd_dev), + offset + (item->length >> 9), + get_capacity(dc->bdev->bd_disk)); + return NULL; + } + + bio = bio_alloc_bioset(GFP_NOWAIT, DIV_ROUND_UP(item->length >> 9, PAGE_SECTORS), &dc->disk.bio_split); + if (!bio) { + bio = bio_alloc_bioset(GFP_NOWAIT, DIV_ROUND_UP(item->length >> 9, PAGE_SECTORS), NULL); + if (!bio) + return NULL; + } + + bio_set_dev(bio, dc->bdev); + bio->bi_iter.bi_sector = item->offset + dc->sb.data_offset; + bio->bi_iter.bi_size = (item->length >> 9) << 9; + + bch_bio_map(bio, NULL); + if (bch_bio_alloc_pages(bio, __GFP_NOWARN | GFP_NOIO)) + goto out_put; + + return bio; +out_put: + bio_put(bio); + return NULL; +} + +int process_one_request(struct acache_info *item) +{ + struct cached_dev *dc; + struct bio *cache_bio; + struct search *s; + + dc = get_cached_device_by_dev(item->dev); + if (dc == NULL) + return -1; + cache_bio = get_bio_by_item(dc, item); + if (cache_bio == NULL) { + pr_err("acache: failed to alloc bio for prefetch\n"); + goto put_dev; + } + + s = search_alloc(cache_bio, &dc->disk, true); + + trace_bcache_prefetch_request(&dc->disk, cache_bio); + cached_dev_read(dc, s); + return 0; + +put_dev: + cached_dev_put(dc); + return -1; +} + diff --git a/drivers/md/bcache/acache.h b/drivers/md/bcache/acache.h new file mode 100644 index 000000000000..dea6e8cb0a05 --- /dev/null +++ b/drivers/md/bcache/acache.h @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef _ACHACHE_INTERFACE_H_ +#define _ACHACHE_INTERFACE_H_ + +#define ACACHE_NR_DEVS 1 + +#define RING_SIZE + +#include "bcache.h" + +struct mem_reg { + char *data; + unsigned long size; +}; + +struct acache_info { + uint64_t length; + uint64_t offset; + uint64_t start_time; + dev_t dev; + int type; +}; + +enum acache_info_type { + ACACHE_INFO_READ = 0, + ACACHE_INFO_WRITE, + ACACHE_INFO_CACHE_INSERT, + ACACHE_INFO_LATENCY, +}; + +struct acache_circ { + spinlock_t lock; + int tail; + int head; + int size; + int item_size; + struct acache_info data[0]; +}; + +struct acache_metadata { + uint32_t magic; + uint32_t conntype; + uint32_t devsize; +}; + +#define ACACHE_DEV_SIZE acache_dev_size +#define ACACHE_MAGIC 2 + +enum acache_conn_types { + ACACHE_NO_CONN = 0, + ACACHE_READWRITE_CONN = 2, +}; + +#define ACACHE_CIRC_SIZE \ + ({int i = (ACACHE_DEV_SIZE - sizeof(struct acache_circ))/sizeof(struct acache_info); \ + int bits = 0; \ + while (i > 0) {i >>= 1; bits++; } \ + 1 << (bits - 1); }) + + +#define ACACHE_GET_METADATA _IOR('a', 1, struct acache_metadata) + +int acache_dev_init(void); +void acache_dev_exit(void); +struct acache_info *fetch_circ_item(struct acache_circ *circ); +void save_circ_item(struct acache_info *data); + +#endif diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index fe6dce125aba..96951e638f17 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2122,12 +2122,12 @@ static bool btree_insert_key(struct btree *b, struct bkey *k, BUG_ON(bkey_cmp(k, &b->key) > 0);

status = bch_btree_insert_key(&b->keys, k, replace_key); + trace_bcache_btree_insert_key(b, k, replace_key != NULL, + status); if (status != BTREE_INSERT_STATUS_NO_INSERT) { bch_check_keys(&b->keys, "%u for %s", status, replace_key ? "replace" : "insert");

- trace_bcache_btree_insert_key(b, k, replace_key != NULL, - status); return true; } else return false; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 214326383145..5a64afd56b97 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -8,11 +8,13 @@ */

#include "bcache.h" +#include "acache.h" #include "btree.h" #include "debug.h" #include "request.h" #include "writeback.h"

+#include <linux/time.h> #include <linux/module.h> #include <linux/hash.h> #include <linux/random.h> @@ -308,10 +310,18 @@ static void bch_data_insert_start(struct closure *cl) void bch_data_insert(struct closure *cl) { struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + struct acache_info msg;

trace_bcache_write(op->c, op->inode, op->bio, op->writeback, op->bypass);

+ msg.offset = op->bio->bi_iter.bi_sector; + msg.length = op->bio->bi_iter.bi_size; + msg.type = ACACHE_INFO_CACHE_INSERT; + msg.dev = bio_dev(op->bio); + msg.start_time = ktime_get_ns(); + save_circ_item(&msg); + bch_keylist_init(&op->insert_keys); bio_get(op->bio); bch_data_insert_start(cl); @@ -460,28 +470,6 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)

/* Cache lookup */

-struct search { - /* Stack frame for bio_complete */ - struct closure cl; - - struct bbio bio; - struct bio *orig_bio; - struct bio *cache_miss; - struct bcache_device *d; - - unsigned int insert_bio_sectors; - unsigned int recoverable:1; - unsigned int write:1; - unsigned int read_dirty_data:1; - unsigned int cache_missed:1; - - struct hd_struct *part; - unsigned long start_time; - - struct btree_op op; - struct data_insert_op iop; -}; - static void bch_cache_read_endio(struct bio *bio) { struct bbio *b = container_of(bio, struct bbio, bio); @@ -540,6 +528,7 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) return MAP_CONTINUE;

/* XXX: figure out best pointer - for multiple cache devices */ + ptr = 0;

PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; @@ -557,21 +546,25 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key); bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);

- n->bi_end_io = bch_cache_read_endio; - n->bi_private = &s->cl; + if (!s->prefetch) { + n->bi_end_io = bch_cache_read_endio; + n->bi_private = &s->cl;

- /* - * The bucket we're reading from might be reused while our bio - * is in flight, and we could then end up reading the wrong - * data. - * - * We guard against this by checking (in cache_read_endio()) if - * the pointer is stale again; if so, we treat it as an error - * and reread from the backing device (but we don't pass that - * error up anywhere). - */ + /* + * The bucket we're reading from might be reused while our bio + * is in flight, and we could then end up reading the wrong + * data. + * + * We guard against this by checking (in cache_read_endio()) if + * the pointer is stale again; if so, we treat it as an error + * and reread from the backing device (but we don't pass that + * error up anywhere). + */

- __bch_submit_bbio(n, b->c); + __bch_submit_bbio(n, b->c); + } else { + bio_put(n); + } return n == bio ? MAP_DONE : MAP_CONTINUE; }

@@ -674,7 +667,12 @@ static void bio_complete(struct search *s)

trace_bcache_request_end(s->d, s->orig_bio); s->orig_bio->bi_status = s->iop.status; - bio_endio(s->orig_bio); + if (s->prefetch) { + bio_free_pages(s->orig_bio); + bio_put(s->orig_bio); + } else { + bio_endio(s->orig_bio); + } s->orig_bio = NULL; } } @@ -699,7 +697,7 @@ static void do_bio_hook(struct search *s, bio_cnt_set(bio, 3); }

-static void search_free(struct closure *cl) +void search_free(struct closure *cl) { struct search *s = container_of(cl, struct search, cl);

@@ -713,8 +711,8 @@ static void search_free(struct closure *cl) mempool_free(s, &s->iop.c->search); }

-static inline struct search *search_alloc(struct bio *bio, - struct bcache_device *d) +struct search *search_alloc(struct bio *bio, + struct bcache_device *d, bool prefetch) { struct search *s;

@@ -733,6 +731,7 @@ static inline struct search *search_alloc(struct bio *bio, s->read_dirty_data = 0; /* Count on the bcache device */ s->start_time = part_start_io_acct(d->disk, &s->part, bio); + s->prefetch = prefetch; s->iop.c = d->c; s->iop.bio = NULL; s->iop.inode = d->id; @@ -836,17 +835,22 @@ static void cached_dev_read_done(struct closure *cl) s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9; bch_bio_map(s->iop.bio, NULL);

- bio_copy_data(s->cache_miss, s->iop.bio); + if (!s->prefetch) + bio_copy_data(s->cache_miss, s->iop.bio); + else + trace_bcache_prefetch_cache_miss(s->iop.bio);

bio_put(s->cache_miss); s->cache_miss = NULL; + }

if (verify(dc) && s->recoverable && !s->read_dirty_data) bch_data_verify(dc, s->orig_bio);

closure_get(&dc->disk.cl); - bio_complete(s); + if (!s->prefetch) + bio_complete(s);

if (s->iop.bio && !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) { @@ -862,10 +866,19 @@ static void cached_dev_read_done_bh(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

- bch_mark_cache_accounting(s->iop.c, s->d, + if (s->prefetch) + pr_debug("prefetch request; do not count cache_missed"); + else + bch_mark_cache_accounting(s->iop.c, s->d, !s->cache_missed, s->iop.bypass); trace_bcache_read(s->orig_bio, !s->cache_missed, s->iop.bypass);

+ if (!s->prefetch && !s->iop.status) { + s->smp.type = ACACHE_INFO_LATENCY; + s->smp.start_time = ktime_get_ns() - s->smp.start_time; + save_circ_item(&s->smp); + } + if (s->iop.status) continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); else if (s->iop.bio || verify(dc)) @@ -891,8 +904,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, }

if (!(bio->bi_opf & REQ_RAHEAD) && - !(bio->bi_opf & (REQ_META|REQ_PRIO)) && - s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) + !(bio->bi_opf & (REQ_META|REQ_PRIO) ) && + s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA && + !s->prefetch) reada = min_t(sector_t, dc->readahead >> 9, get_capacity(bio->bi_disk) - bio_end_sector(bio));

@@ -943,14 +957,18 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, out_put: bio_put(cache_bio); out_submit: - miss->bi_end_io = backing_request_endio; - miss->bi_private = &s->cl; - /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, miss, &s->cl); + if (!s->prefetch) { + miss->bi_end_io = backing_request_endio; + miss->bi_private = &s->cl; + /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, miss, &s->cl); + } else { + bio_put(miss); + } return ret; }

-static void cached_dev_read(struct cached_dev *dc, struct search *s) +void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl;

@@ -1197,7 +1215,7 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio) bio->bi_iter.bi_sector += dc->sb.data_offset;

if (cached_dev_get(dc)) { - s = search_alloc(bio, d); + s = search_alloc(bio, d, false); trace_bcache_request_start(s->d, bio);

if (!bio->bi_iter.bi_size) { @@ -1211,6 +1229,15 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio) } else { s->iop.bypass = check_should_bypass(dc, bio);

+ if (!s->iop.bypass && bio->bi_iter.bi_size && !rw) { + s->smp.offset = bio->bi_iter.bi_sector - dc->sb.data_offset; + s->smp.length = bio->bi_iter.bi_size; + s->smp.type = rw; + s->smp.dev = dc->bdev->bd_dev; + s->smp.start_time = ktime_get_ns(); + save_circ_item(&s->smp); + } + if (rw) cached_dev_write(dc, s); else @@ -1281,7 +1308,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio) return BLK_QC_T_NONE; }

- s = search_alloc(bio, d); + s = search_alloc(bio, d, false); cl = &s->cl; bio = &s->bio.bio;

diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 82b38366a95d..21678037d215 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_REQUEST_H_ #define _BCACHE_REQUEST_H_ +#include "btree.h" +#include "acache.h"

struct data_insert_op { struct closure cl; @@ -44,4 +46,34 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio);

extern struct kmem_cache *bch_search_cache;

+struct search { + /* Stack frame for bio_complete */ + struct closure cl; + + struct bbio bio; + struct bio *orig_bio; + struct bio *cache_miss; + struct bcache_device *d; + + unsigned int insert_bio_sectors; + unsigned int recoverable:1; + unsigned int write:1; + unsigned int read_dirty_data:1; + unsigned int cache_missed:1; + + struct hd_struct *part; + unsigned long start_time; + /* for prefetch, we do not need copy data to bio */ + bool prefetch; + struct list_head list_node; + wait_queue_head_t wqh; + struct acache_info smp; + + struct btree_op op; + struct data_insert_op iop; +}; + +void search_free(struct closure *cl); +struct search *search_alloc(struct bio *bio, struct bcache_device *d, bool prefetch); +void cached_dev_read(struct cached_dev *dc, struct search *s); #endif /* _BCACHE_REQUEST_H_ */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 81f1cc5b3499..a849fc51cd88 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -8,6 +8,7 @@ */

#include "bcache.h" +#include "acache.h" #include "btree.h" #include "debug.h" #include "extents.h" @@ -2846,6 +2847,7 @@ static void bcache_exit(void)

if (bcache_major) unregister_blkdev(bcache_major, "bcache"); + acache_dev_exit(); unregister_reboot_notifier(&reboot); mutex_destroy(&bch_register_lock); } @@ -2932,6 +2934,8 @@ static int __init bcache_init(void)

bch_debug_init(); closure_debug_init(); + if (acache_dev_init()) + goto err;

bcache_is_reboot = false;

diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index e41c611d6d3b..f7be8c6e7cff 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -75,6 +75,12 @@ DECLARE_EVENT_CLASS(btree_node, TP_printk("bucket %zu", __entry->bucket) );

+/* acache.c */ +DEFINE_EVENT(bcache_request, bcache_prefetch_request, + TP_PROTO(struct bcache_device *d, struct bio *bio), + TP_ARGS(d, bio) +); + /* request.c */

DEFINE_EVENT(bcache_request, bcache_request_start, @@ -120,6 +126,11 @@ DEFINE_EVENT(bcache_bio, bcache_bypass_congested, TP_ARGS(bio) );

+DEFINE_EVENT(bcache_bio, bcache_prefetch_cache_miss, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + TRACE_EVENT(bcache_read, TP_PROTO(struct bio *bio, bool hit, bool bypass), TP_ARGS(bio, hit, bypass),

Zheng Zengkai

1270

Age (days ago)

1271

Last active (days ago)

List overview

17 comments

2 participants

participants (2)

Xie XiuQi
Zheng Zengkai

[PATCH openEuler-5.10 01/17] tcp: Add some stub info for KABI consistency

tags

participants (2)