Add a framework to prefetch in bcache. The framework extends bcache's current readahead, which simply appends a fixed-length piece of data to each read and therefore yields little performance gain. The framework records every read request made to a bcache device and sends it to a userspace client; the client calculates a proper prefetch length for each request and sends prefetch requests back to the kernel, which executes them asynchronously.
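For illustration, a minimal sketch of the intended userspace loop follows (assuming the acache_info layout and the /dev/acache read/write interface introduced below; the bundled sample client in samples/acache_client is the complete version):

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

struct acache_info {                 /* must match the kernel layout */
	uint64_t length;             /* request length in bytes */
	uint64_t offset;             /* request offset in sectors */
	uint64_t start_time;
	uint32_t dev;                /* kernel dev_t is 32 bits */
	int type;
};

int main(void)
{
	struct acache_info buf[256];
	int fd = open("/dev/acache", O_RDWR | O_SYNC);
	ssize_t n;

	if (fd < 0)
		return 1;
	for (;;) {
		n = read(fd, buf, sizeof(buf));  /* fetch recorded reads */
		if (n <= 0)
			continue;
		for (size_t i = 0; i < n / sizeof(buf[0]); i++)
			buf[i].offset += buf[i].length >> 9; /* prefetch just past each read */
		write(fd, buf, n);               /* queue prefetch requests */
	}
}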
Li Ruilin (5): bcache: add a framework to perform prefetch bcache: provide a switch to bypass all IO requests bcache: inflight prefetch requests block overlapped normal requests bcache: Delay to invalidate cache data in writearound write bcache: Add a sample of userspace prefetch client
Documentation/admin-guide/bcache.rst | 4 + drivers/md/bcache/Makefile | 2 +- drivers/md/bcache/acache.c | 591 +++++++++++++++++++++++++++ drivers/md/bcache/acache.h | 79 ++++ drivers/md/bcache/bcache.h | 8 +- drivers/md/bcache/btree.c | 4 +- drivers/md/bcache/request.c | 131 ++++-- drivers/md/bcache/request.h | 31 ++ drivers/md/bcache/stats.c | 13 + drivers/md/bcache/stats.h | 3 + drivers/md/bcache/super.c | 6 + drivers/md/bcache/sysfs.c | 12 + include/trace/events/bcache.h | 22 + samples/acache_client/Makefile | 13 + samples/acache_client/connect.c | 144 +++++++ samples/acache_client/connect.h | 74 ++++ samples/acache_client/main.c | 133 ++++++ 17 files changed, 1222 insertions(+), 48 deletions(-) create mode 100644 drivers/md/bcache/acache.c create mode 100644 drivers/md/bcache/acache.h create mode 100644 samples/acache_client/Makefile create mode 100644 samples/acache_client/connect.c create mode 100644 samples/acache_client/connect.h create mode 100644 samples/acache_client/main.c
From: Li Ruilin liruilin4@huawei.com
euleros inclusion category: feature bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=26 CVE: NA
------------------------------
Add a framework to transfer I/O information to a userspace client and to process prefetch requests sent back by that client. Create a char device named "acache" as the channel between kernelspace and userspace. Information about all I/O requests is saved into a buffer and passed to the client when the client reads from the device.
A prefetch request is treated like a normal I/O request. The differences are that prefetch requests do not need to return data to userspace, and no readahead part should be appended to them.
Add two module parameters: acache_dev_size controls the size of the buffer that saves I/O information, and acache_prefetch_workers controls the maximum number of threads that process prefetch requests.
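Before exchanging data, a client is expected to validate the device via the ACACHE_GET_METADATA ioctl. A minimal sketch of that handshake (constants and struct as defined in acache.h; error handling trimmed):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>

struct acache_metadata {
	uint32_t magic;              /* ACACHE_MAGIC == 2 */
	uint32_t conntype;           /* ACACHE_READWRITE_CONN == 2 */
	uint32_t devsize;            /* acache_dev_size set in the kernel */
};
#define ACACHE_GET_METADATA _IOR('a', 1, struct acache_metadata)

int acache_open_checked(void)
{
	struct acache_metadata md;
	int fd = open("/dev/acache", O_RDWR | O_SYNC);

	if (fd < 0 || ioctl(fd, ACACHE_GET_METADATA, &md) ||
	    md.magic != 2 || md.conntype != 2)
		return -1;
	printf("ring buffer size: %u bytes\n", md.devsize);
	return fd;
}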
Signed-off-by: Li Ruilin liruilin4@huawei.com Reviewed-by: Luan Jianhai luanjianhai@huawei.com Reviewed-by: Peng Junyi pengjunyi1@huawei.com Acked-by: Xie Xiuqi xiexiuqi@huawei.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com --- drivers/md/bcache/Makefile | 2 +- drivers/md/bcache/acache.c | 478 ++++++++++++++++++++++++++++++++++ drivers/md/bcache/acache.h | 69 +++++ drivers/md/bcache/bcache.h | 5 +- drivers/md/bcache/btree.c | 4 +- drivers/md/bcache/request.c | 109 +++++--- drivers/md/bcache/request.h | 31 +++ drivers/md/bcache/super.c | 4 + include/trace/events/bcache.h | 11 + 9 files changed, 669 insertions(+), 44 deletions(-) create mode 100644 drivers/md/bcache/acache.c create mode 100644 drivers/md/bcache/acache.h
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index fd714628da6a..f4d5cd626496 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -3,5 +3,5 @@ obj-$(CONFIG_BCACHE) += bcache.o
bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ - io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + io.o journal.o movinggc.o request.o stats.o acache.o super.o sysfs.o trace.o\ util.o writeback.o diff --git a/drivers/md/bcache/acache.c b/drivers/md/bcache/acache.c new file mode 100644 index 000000000000..1f4b71370dee --- /dev/null +++ b/drivers/md/bcache/acache.c @@ -0,0 +1,478 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/cdev.h> +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/circ_buf.h> +#include <linux/list.h> + +#include "acache.h" +#include "request.h" + +#include <trace/events/bcache.h> + +#define DEV_NAME "acache" + +int acache_dev_size = (1024 * 4096 + 4096); + +module_param_named(acache_size, acache_dev_size, int, 0444); +MODULE_PARM_DESC(acache_size, "size of ring buffer for size in byte"); + +int acache_prefetch_workers = 1000; + +module_param_named(prefetch_workers, acache_prefetch_workers, int, 0444); +MODULE_PARM_DESC(prefetch_workers, "num of workers for processing prefetch requests"); + +struct prefetch_worker { + struct acache_info s; + struct work_struct work; + struct list_head list; +}; + +struct acache_device { + bool initialized; + + dev_t devno; + struct cdev cdev; + struct class *class; + struct mem_reg *mem_regionp; + + struct acache_info *readbuf; + struct acache_info *writebuf; + + struct acache_circ *acache_info_circ; + + struct workqueue_struct *wq; + struct prefetch_worker *prefetch_workers; + struct list_head prefetch_workers_free; + spinlock_t prefetch_workers_free_list_lock; +} adev; + +#define MAX_TRANSFER_SIZE (1024 * 1024) + +static atomic_t acache_opened_dev = ATOMIC_INIT(0); +static struct acache_metadata metadata; + + +int acache_open(struct inode *inode, struct file *filp) +{ + struct mem_reg *dev; + + int minor = MINOR(inode->i_rdev); + + if (minor >= ACACHE_NR_DEVS) + return -ENODEV; + if (atomic_xchg(&acache_opened_dev, 1)) + return -EPERM; + + dev = &adev.mem_regionp[minor]; + + filp->private_data = dev; + + return 0; +} + +int acache_release(struct inode *inode, struct file *filp) +{ + atomic_dec(&acache_opened_dev); + return 0; +} + +ssize_t read_circ_slice(struct acache_circ *circ, struct acache_info *buf, + size_t size) +{ + unsigned long first, todo, flags; + + spin_lock_irqsave(&circ->lock, flags); + + todo = CIRC_CNT(circ->head, circ->tail, circ->size); + if (todo == 0) { + spin_unlock_irqrestore(&circ->lock, flags); + return 0; + } + if (todo > size / sizeof(struct acache_info)) + todo = size / sizeof(struct acache_info); + + first = CIRC_CNT_TO_END(circ->head, circ->tail, circ->size); + if (first > todo) + first = todo; + + memcpy(buf, circ->data + circ->tail, first * sizeof(struct acache_info)); + if (first < todo) + memcpy(buf + first, circ->data, + (todo - first) * sizeof(struct acache_info)); + circ->tail = (circ->tail + todo) & (circ->size - 1); + + spin_unlock_irqrestore(&circ->lock, flags); + return todo * sizeof(struct acache_info); +} + +static ssize_t acache_read(struct file *filp, char __user *buf, + size_t size, loff_t *ppos) +{ + long ret, cut; + + if (metadata.conntype != ACACHE_READWRITE_CONN) + return -EINVAL; + + if (size > MAX_TRANSFER_SIZE) + size = MAX_TRANSFER_SIZE; + + ret = read_circ_slice(adev.acache_info_circ, adev.readbuf, size); + if (ret <= 
0) + return ret; + + cut = copy_to_user(buf, adev.readbuf, size); + return ret - cut; +} + +int process_one_request(struct acache_info *item); +static void prefetch_worker_func(struct work_struct *work) +{ + struct prefetch_worker *sw = + container_of(work, struct prefetch_worker, work); + + process_one_request(&sw->s); + spin_lock(&adev.prefetch_workers_free_list_lock); + list_add_tail(&sw->list, &adev.prefetch_workers_free); + spin_unlock(&adev.prefetch_workers_free_list_lock); +} + +static int queue_prefetch_item(struct acache_info *s) +{ + struct prefetch_worker *sw; + + spin_lock(&adev.prefetch_workers_free_list_lock); + sw = list_first_entry_or_null(&adev.prefetch_workers_free, + struct prefetch_worker, list); + if (!sw) { + spin_unlock(&adev.prefetch_workers_free_list_lock); + return -1; + } + list_del_init(&sw->list); + spin_unlock(&adev.prefetch_workers_free_list_lock); + + memcpy(&sw->s, s, sizeof(struct acache_info)); + INIT_WORK(&sw->work, prefetch_worker_func); + queue_work(adev.wq, &sw->work); + return 0; +} + +static ssize_t acache_write(struct file *filp, const char __user *buf, + size_t size, loff_t *ppos) +{ + long cut; + int i; + + if (metadata.conntype != ACACHE_READWRITE_CONN) + return -EINVAL; + + if (size > MAX_TRANSFER_SIZE) + size = MAX_TRANSFER_SIZE; + + cut = copy_from_user(adev.writebuf, buf, size); + for (i = 0; i < (size - cut) / sizeof(struct acache_info); i++) { + if (queue_prefetch_item(adev.writebuf + i)) + break; + } + return i * sizeof(struct acache_info); +} + +static long acache_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case ACACHE_GET_METADATA: + return copy_to_user((struct acache_metadata __user *)arg, + &metadata, sizeof(struct acache_metadata)); + default: + return -EINVAL; + } +} + +static const struct file_operations acache_fops = { + .owner = THIS_MODULE, + .read = acache_read, + .write = acache_write, + .open = acache_open, + .release = acache_release, + .unlocked_ioctl = acache_ioctl, +}; + +void save_circ_item(struct acache_info *data) +{ + unsigned long flags; + struct acache_circ *circ = adev.acache_info_circ; + + spin_lock_irqsave(&circ->lock, flags); + if (CIRC_SPACE(circ->head, circ->tail, circ->size) >= 1) { + memcpy(&circ->data[circ->head], data, sizeof(struct acache_info)); + circ->head = (circ->head + 1) & (circ->size - 1); + } else { + pr_debug("ringbuffer is full; discard new request."); + } + spin_unlock_irqrestore(&circ->lock, flags); +} + +void init_acache_circ(struct acache_circ **circ, void *startaddr) +{ + *circ = (struct acache_circ *)startaddr; + (*circ)->head = 0; + (*circ)->tail = 0; + (*circ)->size = ACACHE_CIRC_SIZE; + spin_lock_init(&(*circ)->lock); +} + +static void acache_free_mem(void) +{ + int i; + + for (i = 0; i < ACACHE_NR_DEVS; i++) + vfree(adev.mem_regionp[i].data); + + if (adev.readbuf) { + vfree(adev.readbuf); + adev.readbuf = NULL; + } + if (adev.writebuf) { + vfree(adev.writebuf); + adev.writebuf = NULL; + } + + kfree(adev.prefetch_workers); + adev.prefetch_workers = NULL; +} + +int acache_prefetch_init(struct acache_device *adev) +{ + int i; + + if (acache_prefetch_workers <= 0) { + pr_err("acache_dev_size should not be less than zero"); + return -1; + } + adev->prefetch_workers = kmalloc_array(acache_prefetch_workers, + sizeof(struct prefetch_worker), + GFP_KERNEL); + if (!adev->prefetch_workers) + goto fail_prefetch_workers_alloc; + + INIT_LIST_HEAD(&adev->prefetch_workers_free); + spin_lock_init(&adev->prefetch_workers_free_list_lock); + for (i = 0; i < 
acache_prefetch_workers; i++) { + spin_lock(&adev->prefetch_workers_free_list_lock); + list_add_tail(&adev->prefetch_workers[i].list, + &adev->prefetch_workers_free); + spin_unlock(&adev->prefetch_workers_free_list_lock); + } + + adev->wq = alloc_workqueue("acache_prefetch", WQ_MEM_RECLAIM, 0); + if (!adev->wq) + goto fail_workqueue_alloc; + + return 0; + +fail_workqueue_alloc: + kfree(adev->prefetch_workers); + adev->prefetch_workers = NULL; +fail_prefetch_workers_alloc: + if (adev->wq) + destroy_workqueue(adev->wq); + return -1; +} + +int acache_dev_init(void) +{ + int ret; + int i; + int major; + struct device *dev; + + major = alloc_chrdev_region(&adev.devno, 0, ACACHE_NR_DEVS, DEV_NAME); + if (major < 0) { + pr_err("failed to allocate chrdev region: %d", major); + return major; + goto fail_allocdev; + } + + adev.class = class_create(THIS_MODULE, DEV_NAME); + if (IS_ERR(adev.class)) { + pr_err("failed to create acache class"); + ret = -1; + goto fail_class; + } + + if (acache_dev_size < PAGE_SIZE) { + pr_err("acache_dev_size should not be less than PAGE_SIZE"); + ret = -1; + goto fail_dev_add; + } + metadata.devsize = acache_dev_size; + metadata.magic = ACACHE_MAGIC; + metadata.conntype = ACACHE_READWRITE_CONN; + cdev_init(&adev.cdev, &acache_fops); + adev.cdev.owner = THIS_MODULE; + + ret = cdev_add(&adev.cdev, adev.devno, ACACHE_NR_DEVS); + if (ret < 0) { + pr_err("failed to add cdev"); + goto fail_dev_add; + } + + dev = device_create(adev.class, NULL, adev.devno, NULL, DEV_NAME); + if (IS_ERR(dev)) { + pr_err("Could not create device"); + ret = -1; + goto fail_device; + } + + adev.readbuf = vmalloc(MAX_TRANSFER_SIZE); + adev.writebuf = vmalloc(MAX_TRANSFER_SIZE); + if (!adev.readbuf || !adev.writebuf) { + ret = -ENOMEM; + goto fail_malloc; + } + + adev.initialized = true; + adev.mem_regionp = + kmalloc_array(ACACHE_NR_DEVS, sizeof(struct mem_reg), GFP_KERNEL); + if (!adev.mem_regionp) { + ret = -ENOMEM; + goto fail_malloc; + } + memset(adev.mem_regionp, 0, sizeof(struct mem_reg) * ACACHE_NR_DEVS); + + for (i = 0; i < ACACHE_NR_DEVS; i++) { + adev.mem_regionp[i].size = ACACHE_DEV_SIZE; + adev.mem_regionp[i].data = vmalloc(ACACHE_DEV_SIZE); + if (!adev.mem_regionp[i].data) { + ret = -ENOMEM; + goto fail_memregion_data_malloc; + } + memset(adev.mem_regionp[i].data, 0, ACACHE_DEV_SIZE); + } + + init_acache_circ(&adev.acache_info_circ, adev.mem_regionp[0].data); + if (acache_prefetch_init(&adev)) + goto fail_prefetch_init; + + return 0; + +fail_prefetch_init: +fail_memregion_data_malloc: + acache_free_mem(); +fail_malloc: + device_destroy(adev.class, adev.devno); +fail_device: + cdev_del(&adev.cdev); +fail_dev_add: + class_destroy(adev.class); +fail_class: + unregister_chrdev_region(adev.devno, ACACHE_NR_DEVS); +fail_allocdev: + return ret; +} + +void acache_dev_exit(void) +{ + if (!adev.initialized) + return; + + if (adev.wq) { + flush_workqueue(adev.wq); + destroy_workqueue(adev.wq); + } + device_destroy(adev.class, adev.devno); + cdev_del(&adev.cdev); + acache_free_mem(); + kfree(adev.mem_regionp); + unregister_chrdev_region(adev.devno, ACACHE_NR_DEVS); + class_destroy(adev.class); + kfree(adev.prefetch_workers); +} + +struct cached_dev *get_cached_device_by_dev(dev_t dev) +{ + struct cache_set *c, *tc; + struct cached_dev *dc, *t; + + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + list_for_each_entry_safe(dc, t, &c->cached_devs, list) + if (dc->bdev->bd_dev == dev && cached_dev_get(dc)) + return dc; + + return NULL; +} + +struct bio *get_bio_by_item(struct cached_dev *dc, 
struct acache_info *item) +{ + struct bio *bio; + uint64_t offset = item->offset + dc->sb.data_offset; + + if (get_capacity(dc->bdev->bd_disk) < offset + (item->length >> 9)) { + pr_err("prefetch area exceeds the capacity of disk(%d:%d), end: %llx, capacity: %lx", + MAJOR(dc->bdev->bd_dev), MINOR(dc->bdev->bd_dev), + offset + (item->length >> 9), + get_capacity(dc->bdev->bd_disk)); + return NULL; + } + + bio = bio_alloc_bioset(GFP_NOWAIT, DIV_ROUND_UP(item->length >> 9, PAGE_SECTORS), &dc->disk.bio_split); + if (!bio) { + bio = bio_alloc_bioset(GFP_NOWAIT, DIV_ROUND_UP(item->length >> 9, PAGE_SECTORS), NULL); + if (!bio) + return NULL; + } + + bio_set_dev(bio, dc->bdev); + bio->bi_iter.bi_sector = item->offset + dc->sb.data_offset; + bio->bi_iter.bi_size = (item->length >> 9) << 9; + + bch_bio_map(bio, NULL); + if (bch_bio_alloc_pages(bio, __GFP_NOWARN | GFP_NOIO)) + goto out_put; + + return bio; +out_put: + bio_put(bio); + return NULL; +} + +int process_one_request(struct acache_info *item) +{ + struct cached_dev *dc; + struct bio *cache_bio; + struct search *s; + + dc = get_cached_device_by_dev(item->dev); + if (dc == NULL) + return -1; + cache_bio = get_bio_by_item(dc, item); + if (cache_bio == NULL) { + pr_err("acache: failed to alloc bio for prefetch"); + goto put_dev; + } + + s = search_alloc(cache_bio, &dc->disk, true); + + trace_bcache_prefetch_request(&dc->disk, cache_bio); + generic_start_io_acct(cache_bio->bi_disk->queue, + bio_op(cache_bio), + bio_sectors(cache_bio), + &s->d->disk->part0); + + cached_dev_read(dc, s); + return 0; + +put_dev: + cached_dev_put(dc); + return -1; +} + diff --git a/drivers/md/bcache/acache.h b/drivers/md/bcache/acache.h new file mode 100644 index 000000000000..dea6e8cb0a05 --- /dev/null +++ b/drivers/md/bcache/acache.h @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef _ACHACHE_INTERFACE_H_ +#define _ACHACHE_INTERFACE_H_ + +#define ACACHE_NR_DEVS 1 + +#define RING_SIZE + +#include "bcache.h" + +struct mem_reg { + char *data; + unsigned long size; +}; + +struct acache_info { + uint64_t length; + uint64_t offset; + uint64_t start_time; + dev_t dev; + int type; +}; + +enum acache_info_type { + ACACHE_INFO_READ = 0, + ACACHE_INFO_WRITE, + ACACHE_INFO_CACHE_INSERT, + ACACHE_INFO_LATENCY, +}; + +struct acache_circ { + spinlock_t lock; + int tail; + int head; + int size; + int item_size; + struct acache_info data[0]; +}; + +struct acache_metadata { + uint32_t magic; + uint32_t conntype; + uint32_t devsize; +}; + +#define ACACHE_DEV_SIZE acache_dev_size +#define ACACHE_MAGIC 2 + +enum acache_conn_types { + ACACHE_NO_CONN = 0, + ACACHE_READWRITE_CONN = 2, +}; + +#define ACACHE_CIRC_SIZE \ + ({int i = (ACACHE_DEV_SIZE - sizeof(struct acache_circ))/sizeof(struct acache_info); \ + int bits = 0; \ + while (i > 0) {i >>= 1; bits++; } \ + 1 << (bits - 1); }) + + +#define ACACHE_GET_METADATA _IOR('a', 1, struct acache_metadata) + +int acache_dev_init(void); +void acache_dev_exit(void); +struct acache_info *fetch_circ_item(struct acache_circ *circ); +void save_circ_item(struct acache_info *data); + +#endif diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 756fc5425d9b..8a65a859bc48 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -176,8 +176,11 @@ * - updates to non leaf nodes just happen synchronously (see btree_split()). */
-#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ +#ifdef pr_fmt +#undef pr_fmt
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ +#endif #include <linux/bcache.h> #include <linux/bio.h> #include <linux/kobject.h> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 8a075fac5d36..0dc71d561050 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2020,12 +2020,12 @@ static bool btree_insert_key(struct btree *b, struct bkey *k, BUG_ON(bkey_cmp(k, &b->key) > 0);
status = bch_btree_insert_key(&b->keys, k, replace_key); + trace_bcache_btree_insert_key(b, k, replace_key != NULL, + status); if (status != BTREE_INSERT_STATUS_NO_INSERT) { bch_check_keys(&b->keys, "%u for %s", status, replace_key ? "replace" : "insert");
- trace_bcache_btree_insert_key(b, k, replace_key != NULL, - status); return true; } else return false; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 820d8402a1dc..55588d13255d 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -8,11 +8,13 @@ */
#include "bcache.h" +#include "acache.h" #include "btree.h" #include "debug.h" #include "request.h" #include "writeback.h"
+#include <linux/time.h> #include <linux/module.h> #include <linux/hash.h> #include <linux/random.h> @@ -308,10 +310,18 @@ static void bch_data_insert_start(struct closure *cl) void bch_data_insert(struct closure *cl) { struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + struct acache_info msg;
trace_bcache_write(op->c, op->inode, op->bio, op->writeback, op->bypass);
+ msg.offset = op->bio->bi_iter.bi_sector; + msg.length = op->bio->bi_iter.bi_size; + msg.type = ACACHE_INFO_CACHE_INSERT; + msg.dev = bio_dev(op->bio); + msg.start_time = ktime_get_ns(); + save_circ_item(&msg); + bch_keylist_init(&op->insert_keys); bio_get(op->bio); bch_data_insert_start(cl); @@ -460,27 +470,6 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
/* Cache lookup */
-struct search { - /* Stack frame for bio_complete */ - struct closure cl; - - struct bbio bio; - struct bio *orig_bio; - struct bio *cache_miss; - struct bcache_device *d; - - unsigned int insert_bio_sectors; - unsigned int recoverable:1; - unsigned int write:1; - unsigned int read_dirty_data:1; - unsigned int cache_missed:1; - - unsigned long start_time; - - struct btree_op op; - struct data_insert_op iop; -}; - static void bch_cache_read_endio(struct bio *bio) { struct bbio *b = container_of(bio, struct bbio, bio); @@ -539,6 +528,7 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) return MAP_CONTINUE;
/* XXX: figure out best pointer - for multiple cache devices */ + ptr = 0;
PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; @@ -556,6 +546,7 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key); bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);
+ if (!s->prefetch) { n->bi_end_io = bch_cache_read_endio; n->bi_private = &s->cl;
@@ -571,6 +562,9 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) */
__bch_submit_bbio(n, b->c); + } else { + bio_put(n); + } return n == bio ? MAP_DONE : MAP_CONTINUE; }
@@ -673,7 +667,12 @@ static void bio_complete(struct search *s)
trace_bcache_request_end(s->d, s->orig_bio); s->orig_bio->bi_status = s->iop.status; - bio_endio(s->orig_bio); + if (s->prefetch) { + bio_free_pages(s->orig_bio); + bio_put(s->orig_bio); + } else { + bio_endio(s->orig_bio); + } s->orig_bio = NULL; } } @@ -698,7 +697,7 @@ static void do_bio_hook(struct search *s, bio_cnt_set(bio, 3); }
-static void search_free(struct closure *cl) +void search_free(struct closure *cl) { struct search *s = container_of(cl, struct search, cl);
@@ -712,8 +711,8 @@ static void search_free(struct closure *cl) mempool_free(s, &s->iop.c->search); }
-static inline struct search *search_alloc(struct bio *bio, - struct bcache_device *d) +struct search *search_alloc(struct bio *bio, + struct bcache_device *d, bool prefetch) { struct search *s;
@@ -731,6 +730,7 @@ static inline struct search *search_alloc(struct bio *bio, s->write = op_is_write(bio_op(bio)); s->read_dirty_data = 0; s->start_time = jiffies; + s->prefetch = prefetch;
s->iop.c = d->c; s->iop.bio = NULL; @@ -830,22 +830,27 @@ static void cached_dev_read_done(struct closure *cl) if (s->iop.bio) { bio_reset(s->iop.bio); s->iop.bio->bi_iter.bi_sector = - s->cache_miss->bi_iter.bi_sector; + s->cache_miss->bi_iter.bi_sector; bio_copy_dev(s->iop.bio, s->cache_miss); s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9; bch_bio_map(s->iop.bio, NULL);
- bio_copy_data(s->cache_miss, s->iop.bio); + if (!s->prefetch) + bio_copy_data(s->cache_miss, s->iop.bio); + else + trace_bcache_prefetch_cache_miss(s->iop.bio);
bio_put(s->cache_miss); s->cache_miss = NULL; + }
if (verify(dc) && s->recoverable && !s->read_dirty_data) bch_data_verify(dc, s->orig_bio);
closure_get(&dc->disk.cl); - bio_complete(s); + if (!s->prefetch) + bio_complete(s);
if (s->iop.bio && !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) { @@ -861,10 +866,19 @@ static void cached_dev_read_done_bh(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
- bch_mark_cache_accounting(s->iop.c, s->d, + if (s->prefetch) + pr_debug("prefetch request; do not count cache_missed"); + else + bch_mark_cache_accounting(s->iop.c, s->d, !s->cache_missed, s->iop.bypass); trace_bcache_read(s->orig_bio, !s->cache_missed, s->iop.bypass);
+ if (!s->prefetch && !s->iop.status) { + s->smp.type = ACACHE_INFO_LATENCY; + s->smp.start_time = ktime_get_ns() - s->smp.start_time; + save_circ_item(&s->smp); + } + if (s->iop.status) continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); else if (s->iop.bio || verify(dc)) @@ -890,8 +904,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, }
if (!(bio->bi_opf & REQ_RAHEAD) && - !(bio->bi_opf & (REQ_META|REQ_PRIO)) && - s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) + !(bio->bi_opf & REQ_META) && + s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA && + !s->prefetch) reada = min_t(sector_t, dc->readahead >> 9, get_capacity(bio->bi_disk) - bio_end_sector(bio));
@@ -932,8 +947,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, if (reada) bch_mark_cache_readahead(s->iop.c, s->d);
- s->cache_miss = miss; - s->iop.bio = cache_bio; + s->cache_miss = miss; + s->iop.bio = cache_bio; bio_get(cache_bio); /* I/O request sent to backing device */ closure_bio_submit(s->iop.c, cache_bio, &s->cl); @@ -942,14 +957,18 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, out_put: bio_put(cache_bio); out_submit: - miss->bi_end_io = backing_request_endio; - miss->bi_private = &s->cl; - /* I/O request sent to backing device */ - closure_bio_submit(s->iop.c, miss, &s->cl); + if (!s->prefetch) { + miss->bi_end_io = backing_request_endio; + miss->bi_private = &s->cl; + /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, miss, &s->cl); + } else { + bio_put(miss); + } return ret; }
-static void cached_dev_read(struct cached_dev *dc, struct search *s) +void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl;
@@ -1196,11 +1215,12 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, bio_sectors(bio), &d->disk->part0);
+ bio_set_dev(bio, dc->bdev); bio->bi_iter.bi_sector += dc->sb.data_offset;
if (cached_dev_get(dc)) { - s = search_alloc(bio, d); + s = search_alloc(bio, d, false); trace_bcache_request_start(s->d, bio);
if (!bio->bi_iter.bi_size) { @@ -1214,6 +1234,15 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, } else { s->iop.bypass = check_should_bypass(dc, bio);
+ if (!s->iop.bypass && bio->bi_iter.bi_size && !rw) { + s->smp.offset = bio->bi_iter.bi_sector - dc->sb.data_offset; + s->smp.length = bio->bi_iter.bi_size; + s->smp.type = rw; + s->smp.dev = dc->bdev->bd_dev; + s->smp.start_time = ktime_get_ns(); + save_circ_item(&s->smp); + } + if (rw) cached_dev_write(dc, s); else @@ -1316,7 +1345,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
- s = search_alloc(bio, d); + s = search_alloc(bio, d, false); cl = &s->cl; bio = &s->bio.bio;
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index c64dbd7a91aa..6366b8861974 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_REQUEST_H_ #define _BCACHE_REQUEST_H_ +#include "btree.h" +#include "acache.h"
struct data_insert_op { struct closure cl; @@ -41,4 +43,33 @@ void bch_flash_dev_request_init(struct bcache_device *d);
extern struct kmem_cache *bch_search_cache;
+struct search { + /* Stack frame for bio_complete */ + struct closure cl; + + struct bbio bio; + struct bio *orig_bio; + struct bio *cache_miss; + struct bcache_device *d; + + unsigned int insert_bio_sectors; + unsigned int recoverable:1; + unsigned int write:1; + unsigned int read_dirty_data:1; + unsigned int cache_missed:1; + + unsigned long start_time; + /* for prefetch, we do not need copy data to bio */ + bool prefetch; + struct list_head list_node; + wait_queue_head_t wqh; + struct acache_info smp; + + struct btree_op op; + struct data_insert_op iop; +}; + +void search_free(struct closure *cl); +struct search *search_alloc(struct bio *bio, struct bcache_device *d, bool prefetch); +void cached_dev_read(struct cached_dev *dc, struct search *s); #endif /* _BCACHE_REQUEST_H_ */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 94045d72952c..b272f0c1ff3b 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -8,6 +8,7 @@ */
#include "bcache.h" +#include "acache.h" #include "btree.h" #include "debug.h" #include "extents.h" @@ -2625,6 +2626,7 @@ static void bcache_exit(void)
if (bcache_major) unregister_blkdev(bcache_major, "bcache"); + acache_dev_exit(); unregister_reboot_notifier(&reboot); mutex_destroy(&bch_register_lock); } @@ -2695,6 +2697,8 @@ static int __init bcache_init(void)
bch_debug_init(); closure_debug_init(); + if (acache_dev_init()) + goto err;
bcache_is_reboot = false;
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index e4526f85c19d..cb15af32291e 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -75,6 +75,12 @@ DECLARE_EVENT_CLASS(btree_node, TP_printk("bucket %zu", __entry->bucket) );
+/* readahead.c */ +DEFINE_EVENT(bcache_request, bcache_prefetch_request, + TP_PROTO(struct bcache_device *d, struct bio *bio), + TP_ARGS(d, bio) +); + /* request.c */
DEFINE_EVENT(bcache_request, bcache_request_start, @@ -120,6 +126,11 @@ DEFINE_EVENT(bcache_bio, bcache_bypass_congested, TP_ARGS(bio) );
+DEFINE_EVENT(bcache_bio, bcache_prefetch_cache_miss, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + TRACE_EVENT(bcache_read, TP_PROTO(struct bio *bio, bool hit, bool bypass), TP_ARGS(bio, hit, bypass),
From: Li Ruilin liruilin4@huawei.com
euleros inclusion category: feature bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=26 CVE: NA
------------------------------
Provide a switch named read_bypass. If enabled, all I/O requests bypass the cache. This option can be useful when userspace prefetch is enabled and the cache device has low capacity.
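For reference, a sketch of toggling the switch from userspace through the sysfs attribute added below (the device name bcache0 in the path is an assumption and depends on the setup):

#include <fcntl.h>
#include <unistd.h>

/* write "1" (or "0") to the read_bypass attribute of the cached device */
static int set_read_bypass(int on)
{
	int fd = open("/sys/block/bcache0/bcache/read_bypass", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, on ? "1" : "0", 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}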
Signed-off-by: Li Ruilin liruilin4@huawei.com Reviewed-by: Luan Jianhai luanjianhai@huawei.com Reviewed-by: Peng Junyi pengjunyi1@huawei.com Acked-by: Xie Xiuqi xiexiuqi@huawei.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com --- Documentation/admin-guide/bcache.rst | 4 ++++ drivers/md/bcache/bcache.h | 2 ++ drivers/md/bcache/request.c | 6 ++++-- drivers/md/bcache/super.c | 1 + drivers/md/bcache/sysfs.c | 6 ++++++ 5 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/Documentation/admin-guide/bcache.rst b/Documentation/admin-guide/bcache.rst index c0ce64d75bbf..44ae47ea5f43 100644 --- a/Documentation/admin-guide/bcache.rst +++ b/Documentation/admin-guide/bcache.rst @@ -434,6 +434,10 @@ sequential_cutoff most recent 128 IOs are tracked so sequential IO can be detected even when it isn't all done at once.
+read_bypass + If enabled, all I/O will bypass the cache. This option can be useful when we + enable userspace prefetch and the cache device has low capacity. + sequential_merge If non zero, bcache keeps a list of the last 128 requests submitted to compare against all new requests to determine which new requests are sequential diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 8a65a859bc48..f2bb640b740f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -379,6 +379,8 @@ struct cached_dev { unsigned char writeback_percent; unsigned int writeback_delay;
+ unsigned int read_bypass; + uint64_t writeback_rate_target; int64_t writeback_rate_proportional; int64_t writeback_rate_integral; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 55588d13255d..800a7ba00fbe 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -852,7 +852,7 @@ static void cached_dev_read_done(struct closure *cl) if (!s->prefetch) bio_complete(s);
- if (s->iop.bio && + if (s->iop.bio && (!dc->read_bypass || s->prefetch) && !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) { BUG_ON(!s->iop.replace); closure_call(&s->iop.cl, bch_data_insert, NULL, cl); @@ -897,12 +897,14 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
s->cache_missed = 1;
- if (s->cache_miss || s->iop.bypass) { + if (s->cache_miss || s->iop.bypass || + (dc->read_bypass && !s->prefetch)) { miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split); ret = miss == bio ? MAP_DONE : MAP_CONTINUE; goto out_submit; }
+ /* if called from do_readahead, no need to do this */ if (!(bio->bi_opf & REQ_RAHEAD) && !(bio->bi_opf & REQ_META) && s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA && diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index b272f0c1ff3b..169e6ad4f16a 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1321,6 +1321,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
dc->sequential_cutoff = 4 << 20; + dc->read_bypass = 0;
for (io = dc->io; io < dc->io + RECENT_IO; io++) { list_add(&io->lru, &dc->io_lru); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 3470fae4eabc..4adc22b11287 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -104,6 +104,7 @@ rw_attribute(congested_read_threshold_us); rw_attribute(congested_write_threshold_us);
rw_attribute(sequential_cutoff); +rw_attribute(read_bypass); rw_attribute(data_csum); rw_attribute(cache_mode); rw_attribute(readahead_cache_policy); @@ -248,6 +249,7 @@ SHOW(__bch_cached_dev) var_printf(partial_stripes_expensive, "%u");
var_hprint(sequential_cutoff); + var_print(read_bypass); var_hprint(readahead);
sysfs_print(running, atomic_read(&dc->running)); @@ -342,6 +344,9 @@ STORE(__cached_dev) sysfs_strtoul_clamp(sequential_cutoff, dc->sequential_cutoff, 0, UINT_MAX); + sysfs_strtoul_clamp(read_bypass, + dc->read_bypass, + 0, 1); d_strtoi_h(readahead);
if (attr == &sysfs_clear_stats) @@ -507,6 +512,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_stripe_size, &sysfs_partial_stripes_expensive, &sysfs_sequential_cutoff, + &sysfs_read_bypass, &sysfs_clear_stats, &sysfs_running, &sysfs_state,
From: Li Ruilin liruilin4@huawei.com
euleros inclusion category: feature bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=26 CVE: NA
------------------------------
Add a list to save all inflight prefetch requests. When an I/O request comes in, check whether it overlaps with any of the prefetch requests. If it does, block the request until the overlapping prefetch request ends.
Add a switch to control whether this blocking is enabled. If it is not, the overlapped I/O request is counted as a fake hit for performance analysis.
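The overlap test is the usual half-open interval check on sectors; condensed from the inflight-list lookup added below:

/* true if bios a and b share at least one sector on the same disk */
static bool bios_overlap(struct bio *a, struct bio *b)
{
	return a->bi_disk == b->bi_disk &&
	       a->bi_iter.bi_sector < bio_end_sector(b) &&
	       bio_end_sector(a) > b->bi_iter.bi_sector;
}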
Signed-off-by: Li Ruilin liruilin4@huawei.com Reviewed-by: Luan Jianhai luanjianhai@huawei.com Reviewed-by: Peng Junyi pengjunyi1@huawei.com Acked-by: Xie Xiuqi xiexiuqi@huawei.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com --- drivers/md/bcache/acache.c | 113 ++++++++++++++++++++++++++++++++++ drivers/md/bcache/acache.h | 10 +++ drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/request.c | 8 +++ drivers/md/bcache/stats.c | 13 ++++ drivers/md/bcache/stats.h | 3 + drivers/md/bcache/super.c | 1 + drivers/md/bcache/sysfs.c | 6 ++ include/trace/events/bcache.h | 11 ++++ 9 files changed, 166 insertions(+)
diff --git a/drivers/md/bcache/acache.c b/drivers/md/bcache/acache.c index 1f4b71370dee..e87c53d4d609 100644 --- a/drivers/md/bcache/acache.c +++ b/drivers/md/bcache/acache.c @@ -31,6 +31,12 @@ int acache_prefetch_workers = 1000; module_param_named(prefetch_workers, acache_prefetch_workers, int, 0444); MODULE_PARM_DESC(prefetch_workers, "num of workers for processing prefetch requests");
+struct inflight_list_head { + struct list_head entry; + spinlock_t io_lock; + bool initialized; +}; + struct prefetch_worker { struct acache_info s; struct work_struct work; @@ -50,6 +56,8 @@ struct acache_device {
struct acache_circ *acache_info_circ;
+ struct inflight_list_head inflight_list; + struct workqueue_struct *wq; struct prefetch_worker *prefetch_workers; struct list_head prefetch_workers_free; @@ -295,6 +303,7 @@ int acache_dev_init(void) int major; struct device *dev;
+ inflight_list_ops.init(); major = alloc_chrdev_region(&adev.devno, 0, ACACHE_NR_DEVS, DEV_NAME); if (major < 0) { pr_err("failed to allocate chrdev region: %d", major); @@ -377,6 +386,7 @@ int acache_dev_init(void) fail_class: unregister_chrdev_region(adev.devno, ACACHE_NR_DEVS); fail_allocdev: + inflight_list_ops.exit(); return ret; }
@@ -395,9 +405,112 @@ void acache_dev_exit(void) kfree(adev.mem_regionp); unregister_chrdev_region(adev.devno, ACACHE_NR_DEVS); class_destroy(adev.class); + inflight_list_ops.exit(); kfree(adev.prefetch_workers); }
+static struct search *__inflight_list_lookup_locked(struct search *s) +{ + struct search *iter; + struct bio *bio, *sbio; + + if (!adev.inflight_list.initialized) + return NULL; + sbio = &s->bio.bio; + list_for_each_entry(iter, &adev.inflight_list.entry, list_node) { + bio = &iter->bio.bio; + if (sbio->bi_disk == bio->bi_disk && + sbio->bi_iter.bi_sector < bio_end_sector(bio) && + bio_end_sector(sbio) > bio->bi_iter.bi_sector) { + return iter; + } + } + return NULL; +} + +static void inflight_list_init(void) +{ + INIT_LIST_HEAD(&adev.inflight_list.entry); + spin_lock_init(&adev.inflight_list.io_lock); + adev.inflight_list.initialized = true; +} + +static void inflight_list_exit(void) +{ + if (!list_empty(&adev.inflight_list.entry)) + pr_err("existing with inflight list not empty"); +} + +static int inflight_list_insert(struct search *s) +{ + if (!adev.inflight_list.initialized) + return -1; + + init_waitqueue_head(&s->wqh); + spin_lock(&adev.inflight_list.io_lock); + list_add_tail(&s->list_node, &adev.inflight_list.entry); + spin_unlock(&adev.inflight_list.io_lock); + + trace_bcache_inflight_list_insert(s->d, s->orig_bio); + return 0; +} + +static int inflight_list_remove(struct search *s) +{ + if (!adev.inflight_list.initialized) + return -1; + + spin_lock(&adev.inflight_list.io_lock); + list_del_init(&s->list_node); + spin_unlock(&adev.inflight_list.io_lock); + + wake_up_interruptible_all(&s->wqh); + + trace_bcache_inflight_list_remove(s->d, s->orig_bio); + return 0; +} + +static bool inflight_list_wait(struct search *s) +{ + struct search *pfs = NULL; + struct cached_dev *dc; + DEFINE_WAIT(wqe); + + if (!adev.inflight_list.initialized) + return false; + + spin_lock(&adev.inflight_list.io_lock); + pfs = __inflight_list_lookup_locked(s); + if (pfs == NULL) { + spin_unlock(&adev.inflight_list.io_lock); + return false; + } + + dc = container_of(pfs->d, struct cached_dev, disk); + if (!dc->inflight_block_enable) { + spin_unlock(&adev.inflight_list.io_lock); + return true; + } + + prepare_to_wait(&pfs->wqh, &wqe, TASK_INTERRUPTIBLE); + + /* unlock here to ensure pfs not changed. */ + spin_unlock(&adev.inflight_list.io_lock); + schedule(); + + finish_wait(&pfs->wqh, &wqe); + + return true; +} + +const struct inflight_queue_ops inflight_list_ops = { + .init = inflight_list_init, + .exit = inflight_list_exit, + .insert = inflight_list_insert, + .remove = inflight_list_remove, + .wait = inflight_list_wait, +}; + struct cached_dev *get_cached_device_by_dev(dev_t dev) { struct cache_set *c, *tc; diff --git a/drivers/md/bcache/acache.h b/drivers/md/bcache/acache.h index dea6e8cb0a05..3c6453d0c4da 100644 --- a/drivers/md/bcache/acache.h +++ b/drivers/md/bcache/acache.h @@ -66,4 +66,14 @@ void acache_dev_exit(void); struct acache_info *fetch_circ_item(struct acache_circ *circ); void save_circ_item(struct acache_info *data);
+struct inflight_queue_ops { + void (*init)(void); + void (*exit)(void); + + int (*insert)(struct search *s); + int (*remove)(struct search *s); + bool (*wait)(struct search *s); +}; +extern const struct inflight_queue_ops inflight_list_ops; + #endif diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index f2bb640b740f..3340f5911711 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -379,6 +379,7 @@ struct cached_dev { unsigned char writeback_percent; unsigned int writeback_delay;
+ unsigned int inflight_block_enable; unsigned int read_bypass;
uint64_t writeback_rate_target; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 800a7ba00fbe..7eff3c6cf0f1 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -706,6 +706,9 @@ void search_free(struct closure *cl) if (s->iop.bio) bio_put(s->iop.bio);
+ if (s->prefetch) + inflight_list_ops.remove(s); + bio_complete(s); closure_debug_destroy(cl); mempool_free(s, &s->iop.c->search); @@ -974,6 +977,11 @@ void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl;
+ if (s->prefetch) + inflight_list_ops.insert(s); + else if (inflight_list_ops.wait(s)) + bch_mark_cache_prefetch_fake_hit(s->iop.c, s->d); + closure_call(&s->iop.cl, cache_lookup, NULL, cl); continue_at(cl, cached_dev_read_done_bh, NULL); } diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 503aafe188dc..c7a6c93aa9e9 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -48,6 +48,7 @@ read_attribute(cache_bypass_misses); read_attribute(cache_hit_ratio); read_attribute(cache_readaheads); read_attribute(cache_miss_collisions); +read_attribute(cache_prefetch_fake_hits); read_attribute(bypassed);
SHOW(bch_stats) @@ -66,6 +67,7 @@ SHOW(bch_stats)
var_print(cache_readaheads); var_print(cache_miss_collisions); + var_print(cache_prefetch_fake_hits); sysfs_hprint(bypassed, var(sectors_bypassed) << 9); #undef var return 0; @@ -88,6 +90,7 @@ static struct attribute *bch_stats_files[] = { &sysfs_cache_hit_ratio, &sysfs_cache_readaheads, &sysfs_cache_miss_collisions, + &sysfs_cache_prefetch_fake_hits, &sysfs_bypassed, NULL }; @@ -147,6 +150,7 @@ static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) scale_stat(&stats->cache_bypass_misses); scale_stat(&stats->cache_readaheads); scale_stat(&stats->cache_miss_collisions); + scale_stat(&stats->cache_prefetch_fake_hits); scale_stat(&stats->sectors_bypassed); } } @@ -170,6 +174,7 @@ static void scale_accounting(struct timer_list *t) move_stat(cache_bypass_misses); move_stat(cache_readaheads); move_stat(cache_miss_collisions); + move_stat(cache_prefetch_fake_hits); move_stat(sectors_bypassed);
scale_stats(&acc->total, 0); @@ -225,6 +230,14 @@ void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d) atomic_inc(&c->accounting.collector.cache_miss_collisions); }
+void bch_mark_cache_prefetch_fake_hit(struct cache_set *c, struct bcache_device *d) +{ + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + + atomic_inc(&dc->accounting.collector.cache_prefetch_fake_hits); + atomic_inc(&c->accounting.collector.cache_prefetch_fake_hits); +} + void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc, int sectors) { diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index abfaabf7e7fc..302b76e982b4 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -9,6 +9,7 @@ struct cache_stat_collector { atomic_t cache_bypass_misses; atomic_t cache_readaheads; atomic_t cache_miss_collisions; + atomic_t cache_prefetch_fake_hits; atomic_t sectors_bypassed; };
@@ -21,6 +22,7 @@ struct cache_stats { unsigned long cache_bypass_misses; unsigned long cache_readaheads; unsigned long cache_miss_collisions; + unsigned long cache_prefetch_fake_hits; unsigned long sectors_bypassed;
unsigned int rescale; @@ -58,6 +60,7 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d); void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d); +void bch_mark_cache_prefetch_fake_hit(struct cache_set *c, struct bcache_device *d); void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc, int sectors); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 169e6ad4f16a..754e88895738 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1321,6 +1321,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
dc->sequential_cutoff = 4 << 20; + dc->inflight_block_enable = 1; dc->read_bypass = 0;
for (io = dc->io; io < dc->io + RECENT_IO; io++) { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4adc22b11287..e23c42622939 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -105,6 +105,7 @@ rw_attribute(congested_write_threshold_us);
rw_attribute(sequential_cutoff); rw_attribute(read_bypass); +rw_attribute(inflight_block_enable); rw_attribute(data_csum); rw_attribute(cache_mode); rw_attribute(readahead_cache_policy); @@ -249,6 +250,7 @@ SHOW(__bch_cached_dev) var_printf(partial_stripes_expensive, "%u");
var_hprint(sequential_cutoff); + var_print(inflight_block_enable); var_print(read_bypass); var_hprint(readahead);
@@ -347,6 +349,9 @@ STORE(__cached_dev) sysfs_strtoul_clamp(read_bypass, dc->read_bypass, 0, 1); + sysfs_strtoul_clamp(inflight_block_enable, + dc->inflight_block_enable, + 0, 1); d_strtoi_h(readahead);
if (attr == &sysfs_clear_stats) @@ -513,6 +518,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_partial_stripes_expensive, &sysfs_sequential_cutoff, &sysfs_read_bypass, + &sysfs_inflight_block_enable, &sysfs_clear_stats, &sysfs_running, &sysfs_state, diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index cb15af32291e..82283c23822a 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -81,6 +81,17 @@ DEFINE_EVENT(bcache_request, bcache_prefetch_request, TP_ARGS(d, bio) );
+/* interface.c */ +DEFINE_EVENT(bcache_request, bcache_inflight_list_insert, + TP_PROTO(struct bcache_device *d, struct bio *bio), + TP_ARGS(d, bio) +); + +DEFINE_EVENT(bcache_request, bcache_inflight_list_remove, + TP_PROTO(struct bcache_device *d, struct bio *bio), + TP_ARGS(d, bio) +); + /* request.c */
DEFINE_EVENT(bcache_request, bcache_request_start,
From: Li Ruilin liruilin4@huawei.com
euleros inclusion category: feature bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=26 CVE: NA
------------------------------
In writearound cache mode, a read request that quickly follows a write request may overwrite the invalidating bkey inserted by that write request.
bch_data_insert() is invoked asynchronously while the bio is submitted to the backing block device, so a read request may be submitted after bch_data_insert() has finished and complete before the backing bio ends. Such a read fetches data from the backing block device and inserts stale data into the cache device. In writearound cache mode, bcache will not invalidate that data again, so later read requests read the stale data from the cache, causing data corruption.
This patch delays the invalidation until the end of the backing bio to avoid this corruption.
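Spelled out, the problematic ordering is:

1) write W arrives; in writearound mode its data goes only to the backing device, and bch_data_insert() inserts the invalidating bkey while W's backing bio is still in flight;
2) read R for the same range arrives after the invalidation, misses, reads the old data from the backing device and inserts it into the cache;
3) W's backing bio completes; the backing device now holds new data while the cache still holds the old data, and nothing invalidates it again.

Moving the invalidation into the write completion path closes the window: any stale data cached by R in step 2 is invalidated once the backing write ends.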
Signed-off-by: Li Ruilin liruilin4@huawei.com Reviewed-by: Luan Jianhai luanjianhai@huawei.com Reviewed-by: Peng Junyi pengjunyi1@huawei.com Acked-by: Xie Xiuqi xiexiuqi@huawei.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com --- drivers/md/bcache/request.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 7eff3c6cf0f1..18c8e5baa011 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -993,8 +993,11 @@ static void cached_dev_write_complete(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+ if (!s->iop.bypass) + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); + up_read_non_owner(&dc->writeback_lock); - cached_dev_bio_complete(cl); + continue_at(cl, cached_dev_bio_complete, NULL); }
static void cached_dev_write(struct cached_dev *dc, struct search *s) @@ -1077,7 +1080,8 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) }
insert_data: - closure_call(&s->iop.cl, bch_data_insert, NULL, cl); + if (!s->iop.bypass) + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); continue_at(cl, cached_dev_write_complete, NULL); }
From: Li Ruilin liruilin4@huawei.com
euleros inclusion category: feature bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=26 CVE: NA
------------------------------
Add a sample client for the new prefetch framework in bcache. The sample program simply reads every read request received by the bcache device and sends one prefetch request per read request. The prefetch request has the same length as the read request, and its position immediately follows the read request.
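The per-request strategy therefore reduces to a two-line transform (as in main.c below; offset is in sectors, length in bytes):

/* prefetch the same amount of data immediately after each read */
out = *in;                     /* copy the incoming acache_info */
out.offset += in->length >> 9; /* advance past the read: bytes -> sectors */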
Signed-off-by: Li Ruilin liruilin4@huawei.com Reviewed-by: Luan Jianhai luanjianhai@huawei.com Reviewed-by: Peng Junyi pengjunyi1@huawei.com Acked-by: Xie Xiuqi xiexiuqi@huawei.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com --- samples/acache_client/Makefile | 13 +++ samples/acache_client/connect.c | 144 ++++++++++++++++++++++++++++++++ samples/acache_client/connect.h | 74 ++++++++++++++++ samples/acache_client/main.c | 133 +++++++++++++++++++++++++++++ 4 files changed, 364 insertions(+) create mode 100644 samples/acache_client/Makefile create mode 100644 samples/acache_client/connect.c create mode 100644 samples/acache_client/connect.h create mode 100644 samples/acache_client/main.c
diff --git a/samples/acache_client/Makefile b/samples/acache_client/Makefile new file mode 100644 index 000000000000..13e5485b3d2f --- /dev/null +++ b/samples/acache_client/Makefile @@ -0,0 +1,13 @@ +.PHONY: client clean + +CC = $(CROSS_COMPILE)gcc +CFLAGS = -Wall -g + + +OBJ = main.o connect.o +client: ${OBJ} + $(CC) $(CFLAGS) $^ -o acache_client +.c.o: + $(CC) $(CFLAGS) -c $< -o $@ +clean: + rm -f *.o acache_client diff --git a/samples/acache_client/connect.c b/samples/acache_client/connect.c new file mode 100644 index 000000000000..2dd442415ee2 --- /dev/null +++ b/samples/acache_client/connect.c @@ -0,0 +1,144 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <unistd.h> +#include <errno.h> + +#include "connect.h" + +static int ACACHE_READWRITE_CAPACITY = 4096; +static struct connection readwrite_conn; +static struct readwrite_conn_metadata { + int initialized; + int fd; +} private; + +void *initialize(struct connection *self) +{ + long ret; + + private.fd = open(acache_path, O_RDWR | O_SYNC); + if (private.fd == -1) { + fprintf(stderr, "error opening device: %s\n", strerror(errno)); + exit(-1); + } + + struct acache_metadata { + uint32_t magic; + uint32_t conntype; + uint32_t devsize; + } acache_metadata; +#define ACACHE_GET_METADATA _IOR('a', 1, struct acache_metadata) + ret = ioctl(private.fd, ACACHE_GET_METADATA, &acache_metadata); + if (ret) { + fprintf(stderr, "error getting device memory length: %s\n", strerror(errno)); + exit(-1); + } + if (acache_metadata.magic != ACACHE_MAGIC) { + fprintf(stderr, "version not match; client: %u kernel: %u\n", + ACACHE_MAGIC, acache_metadata.magic); + exit(-1); + } + if (acache_metadata.conntype != ACACHE_READWRITE_CONN) { + fprintf(stderr, "connect type not match; client: %u kernel: %u\n", + ACACHE_READWRITE_CONN, acache_metadata.conntype); + exit(-1); + } + printf("got dev size %u\n", acache_metadata.devsize); + private.initialized = 1; + + return (void *)&private; +} + +struct readwrite_conn_metadata* get_metadata(struct connection *self) +{ + struct readwrite_conn_metadata *metadata; + + if (self == NULL) { + fprintf(stderr, "connenction uninitailized\n"); + return NULL; + } + + metadata = (struct readwrite_conn_metadata *)self->private; + + if (metadata->initialized == 0) { + fprintf(stderr, "connenction uninitailized\n"); + return NULL; + } + return metadata; +} + +int send_items(struct connection *self, struct acache_info *infos, + size_t count) +{ + long ret; + struct readwrite_conn_metadata *metadata = get_metadata(self); + + if (!metadata) { + return 0; + } + ret = write(metadata->fd, (void*)infos, count * sizeof(struct acache_info)); + if (ret < 0) { + fprintf(stderr, "error writing data: %ld\n", ret); + return 0; + } + if (ret % sizeof(struct acache_info)) { + fprintf(stderr, "error writing data: data length is not multiple of sizeof(struct acache_info): %ld %ld\n", + ret, sizeof(struct acache_info)); + return 0; + } + return ret / sizeof(struct acache_info); +} + +int fetch_items(struct connection *self, struct acache_info *infos, + size_t count) +{ + long ret; + struct readwrite_conn_metadata *metadata = get_metadata(self); + + if (!metadata) { + return 0; + } + ret = read(metadata->fd, (void*)infos, count * sizeof(struct acache_info)); + if (ret < 0) { + fprintf(stderr, "error reading data: %ld\n", ret); + return 0; + } + if (ret % sizeof(struct acache_info)) { + fprintf(stderr, "error reading data: data length is not 
multiple of sizeof(struct acache_info): %ld %ld\n", + ret, sizeof(struct acache_info)); + return 0; + } + return ret / sizeof(struct acache_info); +} + +int get_capacity() { + return ACACHE_READWRITE_CAPACITY; +} + +int close_conn(struct connection *self) +{ + struct readwrite_conn_metadata *metadata = get_metadata(self); + + if (!metadata) { + return 0; + } + close(metadata->fd); + return 0; + +} + +struct connection *initialize_conn_rw(void) +{ + readwrite_conn.ops.close = close_conn; + readwrite_conn.ops.initialize = initialize; + readwrite_conn.ops.send_items = send_items; + readwrite_conn.ops.fetch_items = fetch_items; + readwrite_conn.ops.get_capacity = get_capacity; + readwrite_conn.private = initialize(&readwrite_conn); + return &readwrite_conn; +} diff --git a/samples/acache_client/connect.h b/samples/acache_client/connect.h new file mode 100644 index 000000000000..b0357c78c8c4 --- /dev/null +++ b/samples/acache_client/connect.h @@ -0,0 +1,74 @@ +#ifndef ACACHE_CONNENECT_H +#define ACACHE_CONNENECT_H +#include <stdint.h> + +#define ACACHE_MAGIC 2 +enum acache_conn_types { + ACACHE_NO_CONN = 0, + ACACHE_RINGBUFFER_CONN, + ACACHE_READWRITE_CONN, +}; +#define acache_path "/dev/acache" + +struct acache_info { + uint64_t length; + uint64_t offset; + uint64_t start_time; + uint32_t dev; + int opcode; +}; + +struct connection; +struct connection_operations { + + /* + * initialize connnection + * parameters: none + * return values: + * - void *: private data for connection + */ + void *(*initialize)(struct connection *self); + /* + * send_items send items to peer side + * parameters: + * - infos: data to send + * - count: data length + * return values: + * - number of sent items + */ + int (*send_items)(struct connection *self, struct acache_info *infos, + size_t count); + /* + * send_items recieve items from peer side + * paremeters: + * - infos: buffer to place recieved items + * - count: length of buffer + * return values: + * - number of recieved items + */ + int (*fetch_items)(struct connection *self, struct acache_info *infos, + size_t count); + /* + * close closes the connection + */ + int (*close)(struct connection *self); + + /* + * get_capacity return the capacity of items that can send and revice at once + */ + int (*get_capacity)(struct connection *self); + +}; + +struct connection { + /* + * private data for specific connnetion + */ + void *private; + struct connection_operations ops; +}; + +struct connection *initialize_conn_rw(void); + +#endif + diff --git a/samples/acache_client/main.c b/samples/acache_client/main.c new file mode 100644 index 000000000000..929c70798cfb --- /dev/null +++ b/samples/acache_client/main.c @@ -0,0 +1,133 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <signal.h> +#include <unistd.h> +#include <errno.h> + +#include "connect.h" + +/* + * dev_t in userspace is 8-bytes long but 4-byte long in kernel + * work around this + */ +#define MINORBITS 20 +#define MINORMASK ((1U << MINORBITS) - 1) +#define MKDEV(ma, mi) ((ma)<<MINORBITS | (mi)) +#define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS)) +#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK)) + +struct acache_info *inbuf, *outbuf; +struct connection *conn; + +void print_infos(const char *prefix, struct acache_info *infos, size_t length) +{ + size_t i; + struct acache_info *info; + + for (i = 0; i < length; i++) { + info = infos + i; + + printf("%4s,%20lu,%8u,%8u,%15lu,%12lu\n", + prefix, 
info->start_time, MAJOR(info->dev), + MINOR(info->dev), info->offset, info->length); + } +} + +int malloc_buffers(struct acache_info **inbuf, struct acache_info **outbuf, + size_t capacity) +{ + /* prepare buffers to store incoming or outgoing items */ + *inbuf = (struct acache_info *)malloc(sizeof(struct acache_info) * capacity); + *outbuf = (struct acache_info *)malloc(sizeof(struct acache_info) * capacity); + + if (!*inbuf || !*outbuf) { + fprintf(stderr, "error malloc memory: %s\n, size: %lu, %lu\n", + strerror(errno), + sizeof(struct acache_info) * capacity, + sizeof(struct acache_info) * capacity); + return -errno; + } + return 0; +} + +void free_buffer(struct acache_info **buf) +{ + if (buf && *buf) { + free(*buf); + *buf = NULL; + } +} + +void elegant_exit(int sig) { + printf("exiting..."); + free_buffer(&inbuf); + free_buffer(&outbuf); + conn->ops.close(conn); + exit(0); +} + +int main(int argc, char **argv) +{ + int debug = 0; + int ret; + int outbuf_tail; + size_t capacity; + + conn = initialize_conn_rw(); + + if (conn == NULL) { + fprintf(stderr, "error initialzied connnection\n"); + return -1; + } + + if (argc > 1 && strcmp("-d", argv[1]) == 0) + debug = 1; + + /* prepare buffers to store incoming or outgoing items */ + capacity = conn->ops.get_capacity(conn); + ret = malloc_buffers(&inbuf, &outbuf, capacity); + + if (ret < 0) + return ret; + + if (debug) { + printf("%4s,%20s,%8s,%8s,%15s,%12s\n", + "op","time(ns)","majorDev","minorDev","offset(B)","length(B)"); + } + /* main loop */ + if (signal(SIGINT, elegant_exit) == SIG_ERR) { + fprintf(stderr, "error handling SIGINT: %s\n", strerror(errno)); + } + if (signal(SIGTERM, elegant_exit) == SIG_ERR) { + fprintf(stderr, "error handling SIGTERM: %s\n", strerror(errno)); + } + while (1) { + unsigned int i, inlen; + + inlen = conn->ops.fetch_items(conn, inbuf, capacity); + if (!inlen) { + usleep(100); + continue; + } + + outbuf_tail = 0; + for (i = 0; i < inlen; i++) { + /* customize prefetch strategy here */ + memcpy(outbuf + outbuf_tail, inbuf + i, sizeof(struct acache_info)); + outbuf[outbuf_tail].offset += outbuf[outbuf_tail].length >> 9; + outbuf_tail++; + } + if (debug) { + print_infos("R", inbuf, inlen); + print_infos("P", outbuf, outbuf_tail); + } + if (outbuf_tail) { + conn->ops.send_items(conn, outbuf, outbuf_tail); + } + } + return 0; +}