[PATCH kernel-4.19 1/5] bcache: add a framework to perform prefetch

22 Apr 2021

From: Li Ruilin liruilin4@huawei.com
euleros inclusion
category: feature
bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=26
CVE: NA
------------------------------
Add a framework to transfer I/O information to a userspace client and
to process prefetch requests sent by that client. Create a char device
named "acache" for communication between kernel space and userspace.
Save information about all I/O requests into a buffer and pass it to
the client when the client reads from the device.
A prefetch request can be treated as a normal I/O request. The difference
is that these requests do not need to return data to userspace, and they
should not append a readahead part.
Add two parameters. acache_dev_size controls the size of the buffer
used to save I/O information. acache_prefetch_workers controls the
maximum number of threads used to process prefetch requests.
Signed-off-by: Li Ruilin liruilin4@huawei.com
Reviewed-by: Luan Jianhai luanjianhai@huawei.com
Reviewed-by: Peng Junyi pengjunyi1@huawei.com
Acked-by: Xie Xiuqi xiexiuqi@huawei.com
Signed-off-by: Cheng Jian cj.chengjian@huawei.com
---
 drivers/md/bcache/Makefile    |   2 +-
 drivers/md/bcache/acache.c    | 478 ++++++++++++++++++++++++++++++++++
 drivers/md/bcache/acache.h    |  69 +++++
 drivers/md/bcache/bcache.h    |   5 +-
 drivers/md/bcache/btree.c     |   4 +-
 drivers/md/bcache/request.c   | 109 +++++---
 drivers/md/bcache/request.h   |  31 +++
 drivers/md/bcache/super.c     |   4 +
 include/trace/events/bcache.h |  11 +
 9 files changed, 669 insertions(+), 44 deletions(-)
 create mode 100644 drivers/md/bcache/acache.c
 create mode 100644 drivers/md/bcache/acache.h

diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index fd714628da6a..f4d5cd626496 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -3,5 +3,5 @@
 obj-$(CONFIG_BCACHE)	+= bcache.o
bcache-y		:= alloc.o bset.o btree.o closure.o debug.o extents.o\
-	io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
+	io.o journal.o movinggc.o request.o stats.o acache.o super.o sysfs.o trace.o\
    util.o writeback.o
diff --git a/drivers/md/bcache/acache.c b/drivers/md/bcache/acache.c
new file mode 100644
index 000000000000..1f4b71370dee
--- /dev/null
+++ b/drivers/md/bcache/acache.c
@@ -0,0 +1,478 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/cdev.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/circ_buf.h>
+#include <linux/list.h>
+
+#include "acache.h"
+#include "request.h"
+
+#include <trace/events/bcache.h>
+
+#define DEV_NAME "acache"
+
+/* Size in bytes of each acache memory region (used as ACACHE_DEV_SIZE). */
+int acache_dev_size = (1024 * 4096 + 4096);
+
+module_param_named(acache_size, acache_dev_size, int, 0444);
+MODULE_PARM_DESC(acache_size, "size of ring buffer in bytes");
+
+/* Number of prefetch_worker slots allocated by acache_prefetch_init(). */
+int acache_prefetch_workers = 1000;
+
+module_param_named(prefetch_workers, acache_prefetch_workers, int, 0444);
+MODULE_PARM_DESC(prefetch_workers, "num of workers for processing prefetch requests");
+
+struct prefetch_worker {
+	struct acache_info s;	/* private copy of the request to prefetch */
+	struct work_struct work;	/* runs prefetch_worker_func() on adev.wq */
+	struct list_head list;	/* link in adev.prefetch_workers_free while idle */
+};
+
+struct acache_device {
+	bool initialized;	/* checked by acache_dev_exit() before tearing down */
+
+	dev_t devno;	/* filled in by alloc_chrdev_region() */
+	struct cdev cdev;
+	struct class *class;
+	struct mem_reg *mem_regionp;	/* ACACHE_NR_DEVS regions; region 0 backs the ring */
+
+	struct acache_info *readbuf;	/* MAX_TRANSFER_SIZE staging buffer for read() */
+	struct acache_info *writebuf;	/* MAX_TRANSFER_SIZE staging buffer for write() */
+
+	struct acache_circ *acache_info_circ;	/* ring of I/O records, placed in mem_regionp[0].data */
+
+	struct workqueue_struct *wq;	/* runs queued prefetch requests */
+	struct prefetch_worker *prefetch_workers;	/* array of acache_prefetch_workers slots */
+	struct list_head prefetch_workers_free;	/* slots that are not currently queued */
+	spinlock_t prefetch_workers_free_list_lock;	/* protects prefetch_workers_free */
+} adev;
+
+#define MAX_TRANSFER_SIZE (1024 * 1024)	/* cap, in bytes, on a single read()/write() */
+
+static atomic_t acache_opened_dev = ATOMIC_INIT(0);	/* set to 1 by acache_open() */
+static struct acache_metadata metadata;	/* copied out by the ACACHE_GET_METADATA ioctl */
+
+
+/*
+ * Open handler for the acache char device.
+ *
+ * Only minors below ACACHE_NR_DEVS are accepted, and only one opener is
+ * allowed at a time: acache_opened_dev is swapped to 1 and the open is
+ * refused with -EPERM if it was already set.  On success the mem_reg
+ * matching the minor is stored in filp->private_data.
+ */
+int acache_open(struct inode *inode, struct file *filp)
+{
+	const int idx = MINOR(inode->i_rdev);
+
+	if (idx >= ACACHE_NR_DEVS)
+		return -ENODEV;
+
+	/* atomic_xchg() hands back the previous value; non-zero means busy. */
+	if (atomic_xchg(&acache_opened_dev, 1) != 0)
+		return -EPERM;
+
+	filp->private_data = &adev.mem_regionp[idx];
+	return 0;
+}
+
+int acache_release(struct inode *inode, struct file *filp)
+{
+	atomic_dec(&acache_opened_dev);	/* undo the 1 stored by atomic_xchg() in acache_open() */
+	return 0;
+}
+
+/*
+ * Move queued records from @circ into @buf.
+ *
+ * At most @size bytes worth of whole struct acache_info records are
+ * copied, oldest first, and the ring's tail is advanced past them.  The
+ * copy is split in two when the records wrap around the end of the ring.
+ *
+ * Returns the number of bytes stored in @buf (0 if the ring is empty or
+ * @size is smaller than one record).
+ */
+ssize_t read_circ_slice(struct acache_circ *circ, struct acache_info *buf,
+			size_t size)
+{
+	const size_t rec = sizeof(struct acache_info);
+	unsigned long irqflags, avail, contig;
+	ssize_t copied = 0;
+
+	spin_lock_irqsave(&circ->lock, irqflags);
+
+	avail = CIRC_CNT(circ->head, circ->tail, circ->size);
+	if (avail != 0) {
+		/* Never hand out more records than the caller has room for. */
+		if (avail > size / rec)
+			avail = size / rec;
+
+		/* Records available before the ring wraps back to slot 0. */
+		contig = CIRC_CNT_TO_END(circ->head, circ->tail, circ->size);
+		if (contig > avail)
+			contig = avail;
+
+		memcpy(buf, &circ->data[circ->tail], contig * rec);
+		if (contig < avail)
+			memcpy(&buf[contig], &circ->data[0],
+			       (avail - contig) * rec);
+
+		circ->tail = (circ->tail + avail) & (circ->size - 1);
+		copied = avail * rec;
+	}
+
+	spin_unlock_irqrestore(&circ->lock, irqflags);
+	return copied;
+}
+
+/*
+ * read() handler: hand queued I/O records to the userspace client.
+ *
+ * Records are first drained from the ring into adev.readbuf and then
+ * copied out.  At most MAX_TRANSFER_SIZE bytes are transferred per call
+ * and only whole struct acache_info records are drained.
+ *
+ * Returns the number of bytes copied to @buf, 0 when the ring is empty,
+ * -EINVAL when the connection type is not ACACHE_READWRITE_CONN, or
+ * -EFAULT when none of the drained bytes could be copied to @buf.
+ */
+static ssize_t acache_read(struct file *filp, char __user *buf,
+			   size_t size, loff_t *ppos)
+{
+	long ret, cut;
+
+	if (metadata.conntype != ACACHE_READWRITE_CONN)
+		return -EINVAL;
+
+	if (size > MAX_TRANSFER_SIZE)
+		size = MAX_TRANSFER_SIZE;
+
+	ret = read_circ_slice(adev.acache_info_circ, adev.readbuf, size);
+	if (ret <= 0)
+		return ret;
+
+	/*
+	 * Copy only the bytes just drained from the ring.  Copying @size
+	 * bytes would also expose whatever sits in readbuf beyond @ret and
+	 * would make the "ret - cut" result meaningless.
+	 */
+	cut = copy_to_user(buf, adev.readbuf, ret);
+	if (cut == ret)
+		return -EFAULT;
+	return ret - cut;
+}
+
+int process_one_request(struct acache_info *item);
+
+/*
+ * Workqueue callback: run the prefetch described by the enclosing
+ * prefetch_worker, then put the worker slot back on the free list.
+ */
+static void prefetch_worker_func(struct work_struct *work)
+{
+	struct prefetch_worker *worker;
+
+	worker = container_of(work, struct prefetch_worker, work);
+	process_one_request(&worker->s);
+
+	spin_lock(&adev.prefetch_workers_free_list_lock);
+	list_add_tail(&worker->list, &adev.prefetch_workers_free);
+	spin_unlock(&adev.prefetch_workers_free_list_lock);
+}
+
+/*
+ * Take a worker slot off the free list, copy @s into it and queue it on
+ * adev.wq.
+ *
+ * Returns 0 on success or -1 when no worker slot is free.
+ */
+static int queue_prefetch_item(struct acache_info *s)
+{
+	struct prefetch_worker *slot;
+
+	spin_lock(&adev.prefetch_workers_free_list_lock);
+	slot = list_first_entry_or_null(&adev.prefetch_workers_free,
+					struct prefetch_worker, list);
+	if (slot)
+		list_del_init(&slot->list);
+	spin_unlock(&adev.prefetch_workers_free_list_lock);
+
+	if (!slot)
+		return -1;
+
+	/* The slot keeps its own copy; the caller's buffer may be reused. */
+	memcpy(&slot->s, s, sizeof(slot->s));
+	INIT_WORK(&slot->work, prefetch_worker_func);
+	queue_work(adev.wq, &slot->work);
+	return 0;
+}
+
+/*
+ * write() handler: accept prefetch requests from the userspace client.
+ *
+ * @buf is treated as an array of struct acache_info.  Up to
+ * MAX_TRANSFER_SIZE bytes are staged in adev.writebuf and each complete
+ * record is queued with queue_prefetch_item() until one is refused.
+ *
+ * Returns the number of bytes consumed (a multiple of the record size,
+ * possibly 0 when no worker slot is free or less than one record was
+ * supplied), -EINVAL when the connection type is not
+ * ACACHE_READWRITE_CONN, or -EFAULT when nothing could be read from @buf.
+ */
+static ssize_t acache_write(struct file *filp, const char __user *buf,
+			    size_t size, loff_t *ppos)
+{
+	size_t copied, nr, i;
+
+	if (metadata.conntype != ACACHE_READWRITE_CONN)
+		return -EINVAL;
+
+	if (size > MAX_TRANSFER_SIZE)
+		size = MAX_TRANSFER_SIZE;
+
+	/* copy_from_user() returns the number of bytes it could NOT copy. */
+	copied = size - copy_from_user(adev.writebuf, buf, size);
+	if (size && !copied)
+		return -EFAULT;
+
+	nr = copied / sizeof(struct acache_info);
+	for (i = 0; i < nr; i++) {
+		if (queue_prefetch_item(adev.writebuf + i))
+			break;
+	}
+	return i * sizeof(struct acache_info);
+}
+
+/*
+ * ioctl handler.
+ *
+ * ACACHE_GET_METADATA copies the module's struct acache_metadata to the
+ * user pointer passed in @arg.
+ *
+ * Returns 0 on success, -EFAULT when the copy to userspace fails, or
+ * -EINVAL for an unknown @cmd.
+ */
+static long acache_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case ACACHE_GET_METADATA:
+		/*
+		 * copy_to_user() returns the count of bytes left uncopied,
+		 * which must not be passed back as the ioctl result.
+		 */
+		if (copy_to_user((struct acache_metadata __user *)arg,
+				 &metadata, sizeof(struct acache_metadata)))
+			return -EFAULT;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+/* File operations of the acache char device. */
+static const struct file_operations acache_fops = {
+	.owner		= THIS_MODULE,
+	.open		= acache_open,
+	.release	= acache_release,
+	.read		= acache_read,
+	.write		= acache_write,
+	.unlocked_ioctl	= acache_ioctl,
+};
+
+/*
+ * Append a copy of @data to the I/O record ring.  When the ring has no
+ * free slot the record is dropped and only a debug message is emitted.
+ */
+void save_circ_item(struct acache_info *data)
+{
+	unsigned long irqflags;
+	struct acache_circ *ring = adev.acache_info_circ;
+
+	spin_lock_irqsave(&ring->lock, irqflags);
+	if (CIRC_SPACE(ring->head, ring->tail, ring->size) < 1) {
+		pr_debug("ringbuffer is full; discard new request.");
+		goto unlock;
+	}
+
+	memcpy(&ring->data[ring->head], data, sizeof(struct acache_info));
+	ring->head = (ring->head + 1) & (ring->size - 1);
+unlock:
+	spin_unlock_irqrestore(&ring->lock, irqflags);
+}
+
+/*
+ * Place an empty ring header at @startaddr and return it through @circ.
+ * The slot count comes from ACACHE_CIRC_SIZE.
+ */
+void init_acache_circ(struct acache_circ **circ, void *startaddr)
+{
+	struct acache_circ *ring = startaddr;
+
+	*circ = ring;
+	ring->head = 0;
+	ring->tail = 0;
+	ring->size = ACACHE_CIRC_SIZE;
+	spin_lock_init(&ring->lock);
+}
+
+/*
+ * Release the region data, both transfer buffers and the prefetch worker
+ * array.  Every pointer is cleared after it is freed, so the function is
+ * safe to call again and safe to call before mem_regionp was allocated.
+ * The mem_regionp array itself is not freed here.
+ */
+static void acache_free_mem(void)
+{
+	int i;
+
+	if (adev.mem_regionp) {
+		for (i = 0; i < ACACHE_NR_DEVS; i++) {
+			vfree(adev.mem_regionp[i].data);
+			adev.mem_regionp[i].data = NULL;
+		}
+	}
+
+	/* vfree() and kfree() both accept NULL, so no guards are needed. */
+	vfree(adev.readbuf);
+	adev.readbuf = NULL;
+	vfree(adev.writebuf);
+	adev.writebuf = NULL;
+
+	kfree(adev.prefetch_workers);
+	adev.prefetch_workers = NULL;
+}
+
+/*
+ * Allocate the prefetch worker slots, put all of them on the free list
+ * and create the workqueue that runs prefetch requests.
+ *
+ * Returns 0 on success, -EINVAL when acache_prefetch_workers is not
+ * positive, or -ENOMEM when an allocation fails.  On failure nothing is
+ * left allocated.
+ */
+int acache_prefetch_init(struct acache_device *adev)
+{
+	int i;
+
+	if (acache_prefetch_workers <= 0) {
+		pr_err("acache_prefetch_workers should be greater than zero");
+		return -EINVAL;
+	}
+
+	adev->prefetch_workers = kmalloc_array(acache_prefetch_workers,
+					       sizeof(struct prefetch_worker),
+					       GFP_KERNEL);
+	if (!adev->prefetch_workers)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&adev->prefetch_workers_free);
+	spin_lock_init(&adev->prefetch_workers_free_list_lock);
+
+	spin_lock(&adev->prefetch_workers_free_list_lock);
+	for (i = 0; i < acache_prefetch_workers; i++)
+		list_add_tail(&adev->prefetch_workers[i].list,
+			      &adev->prefetch_workers_free);
+	spin_unlock(&adev->prefetch_workers_free_list_lock);
+
+	adev->wq = alloc_workqueue("acache_prefetch", WQ_MEM_RECLAIM, 0);
+	if (!adev->wq) {
+		/* Empty the list first: its nodes live in the array freed below. */
+		INIT_LIST_HEAD(&adev->prefetch_workers_free);
+		kfree(adev->prefetch_workers);
+		adev->prefetch_workers = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Create the acache char device and everything behind it.
+ *
+ * All buffers, the record ring and the prefetch machinery are set up
+ * before cdev_add() makes the device reachable, so an early open() can
+ * never observe a half-initialised adev.  adev.initialized is set only
+ * once every step has succeeded, which keeps acache_dev_exit() a no-op
+ * after a failed init.
+ *
+ * Returns 0 on success or a non-zero error code, with every resource
+ * acquired so far released again.
+ */
+int acache_dev_init(void)
+{
+	struct device *dev;
+	int ret;
+	int i;
+
+	ret = alloc_chrdev_region(&adev.devno, 0, ACACHE_NR_DEVS, DEV_NAME);
+	if (ret < 0) {
+		pr_err("failed to allocate chrdev region: %d", ret);
+		return ret;
+	}
+
+	adev.class = class_create(THIS_MODULE, DEV_NAME);
+	if (IS_ERR(adev.class)) {
+		pr_err("failed to create acache class");
+		ret = PTR_ERR(adev.class);
+		goto fail_class;
+	}
+
+	/*
+	 * PAGE_SIZE is unsigned; compare as int so that a negative module
+	 * parameter is rejected instead of being converted to a huge value.
+	 */
+	if (acache_dev_size < (int)PAGE_SIZE) {
+		pr_err("acache_dev_size should not be less than PAGE_SIZE");
+		ret = -EINVAL;
+		goto fail_setup;
+	}
+	metadata.devsize = acache_dev_size;
+	metadata.magic = ACACHE_MAGIC;
+	metadata.conntype = ACACHE_READWRITE_CONN;
+
+	adev.readbuf = vmalloc(MAX_TRANSFER_SIZE);
+	adev.writebuf = vmalloc(MAX_TRANSFER_SIZE);
+	if (!adev.readbuf || !adev.writebuf) {
+		ret = -ENOMEM;
+		goto fail_buf;
+	}
+
+	adev.mem_regionp =
+	    kcalloc(ACACHE_NR_DEVS, sizeof(struct mem_reg), GFP_KERNEL);
+	if (!adev.mem_regionp) {
+		ret = -ENOMEM;
+		goto fail_buf;
+	}
+
+	for (i = 0; i < ACACHE_NR_DEVS; i++) {
+		adev.mem_regionp[i].size = ACACHE_DEV_SIZE;
+		adev.mem_regionp[i].data = vzalloc(ACACHE_DEV_SIZE);
+		if (!adev.mem_regionp[i].data) {
+			ret = -ENOMEM;
+			goto fail_mem;
+		}
+	}
+
+	init_acache_circ(&adev.acache_info_circ, adev.mem_regionp[0].data);
+
+	/* Propagate the failure; returning the stale 0 would report success. */
+	ret = acache_prefetch_init(&adev);
+	if (ret)
+		goto fail_mem;
+
+	cdev_init(&adev.cdev, &acache_fops);
+	adev.cdev.owner = THIS_MODULE;
+
+	ret = cdev_add(&adev.cdev, adev.devno, ACACHE_NR_DEVS);
+	if (ret < 0) {
+		pr_err("failed to add cdev");
+		goto fail_cdev;
+	}
+
+	dev = device_create(adev.class, NULL, adev.devno, NULL, DEV_NAME);
+	if (IS_ERR(dev)) {
+		pr_err("Could not create device");
+		ret = PTR_ERR(dev);
+		goto fail_device;
+	}
+
+	adev.initialized = true;
+	return 0;
+
+fail_device:
+	cdev_del(&adev.cdev);
+fail_cdev:
+	destroy_workqueue(adev.wq);
+	adev.wq = NULL;
+fail_mem:
+	/* Frees region data, transfer buffers and the worker array. */
+	acache_free_mem();
+	adev.acache_info_circ = NULL;
+	kfree(adev.mem_regionp);
+	adev.mem_regionp = NULL;
+fail_buf:
+	/* Reached directly when mem_regionp is still NULL; vfree(NULL) is fine. */
+	vfree(adev.readbuf);
+	adev.readbuf = NULL;
+	vfree(adev.writebuf);
+	adev.writebuf = NULL;
+fail_setup:
+	class_destroy(adev.class);
+fail_class:
+	unregister_chrdev_region(adev.devno, ACACHE_NR_DEVS);
+	return ret;
+}
+
+/*
+ * Tear down everything acache_dev_init() created.  Does nothing unless
+ * adev.initialized is set; the flag and all pointers are cleared so a
+ * repeated call is harmless.
+ */
+void acache_dev_exit(void)
+{
+	if (!adev.initialized)
+		return;
+	adev.initialized = false;
+
+	if (adev.wq) {
+		flush_workqueue(adev.wq);
+		destroy_workqueue(adev.wq);
+		adev.wq = NULL;
+	}
+	device_destroy(adev.class, adev.devno);
+	cdev_del(&adev.cdev);
+
+	/* acache_free_mem() also releases the prefetch worker array. */
+	acache_free_mem();
+	adev.acache_info_circ = NULL;
+	kfree(adev.mem_regionp);
+	adev.mem_regionp = NULL;
+
+	unregister_chrdev_region(adev.devno, ACACHE_NR_DEVS);
+	class_destroy(adev.class);
+}
+
+/*
+ * Look through every cache set for a cached device whose backing block
+ * device number is @dev and on which cached_dev_get() succeeds.
+ *
+ * Returns that device, or NULL when none matches.  A caller that gets a
+ * device back is expected to drop it with cached_dev_put().
+ *
+ * NOTE(review): bch_cache_sets is walked without any lock visible here;
+ * confirm that callers are serialised against (un)registration.
+ */
+struct cached_dev *get_cached_device_by_dev(dev_t dev)
+{
+	struct cache_set *set, *next_set;
+	struct cached_dev *cand, *next_cand;
+
+	list_for_each_entry_safe(set, next_set, &bch_cache_sets, list) {
+		list_for_each_entry_safe(cand, next_cand,
+					 &set->cached_devs, list) {
+			if (cand->bdev->bd_dev != dev)
+				continue;
+			if (cached_dev_get(cand))
+				return cand;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Build a bio, with freshly allocated pages, that covers the range
+ * described by @item on the backing device of @dc.
+ *
+ * @item comes from userspace: item->offset is a sector number relative to
+ * the data area (dc->sb.data_offset is added here) and item->length is a
+ * byte count that is rounded down to whole 512-byte sectors.
+ *
+ * Returns the bio, or NULL when the range is empty, does not fit into a
+ * bio's 32-bit size, lies outside the device, or an allocation fails.
+ */
+struct bio *get_bio_by_item(struct cached_dev *dc, struct acache_info *item)
+{
+	struct bio *bio;
+	uint64_t capacity = get_capacity(dc->bdev->bd_disk);
+	uint64_t sectors = item->length >> 9;
+	uint64_t start = item->offset + dc->sb.data_offset;
+	unsigned int nr_sectors, nr_vecs;
+
+	/*
+	 * Written so that neither the addition above nor "start + sectors"
+	 * can wrap around and slip past the capacity check.
+	 */
+	if (start < item->offset || start > capacity ||
+	    sectors > capacity - start) {
+		pr_err("prefetch area exceeds the capacity of disk(%d:%d), end: %llx, capacity: %llx",
+		    MAJOR(dc->bdev->bd_dev), MINOR(dc->bdev->bd_dev),
+		    (unsigned long long)(start + sectors),
+		    (unsigned long long)capacity);
+		return NULL;
+	}
+
+	/* bi_size is a 32-bit byte count; an empty request has nothing to read. */
+	if (!sectors || sectors > (UINT_MAX >> 9))
+		return NULL;
+	nr_sectors = sectors;
+	nr_vecs = DIV_ROUND_UP(nr_sectors, PAGE_SECTORS);
+
+	/* Try the device's bio set first, then fall back to a plain allocation. */
+	bio = bio_alloc_bioset(GFP_NOWAIT, nr_vecs, &dc->disk.bio_split);
+	if (!bio) {
+		bio = bio_alloc_bioset(GFP_NOWAIT, nr_vecs, NULL);
+		if (!bio)
+			return NULL;
+	}
+
+	bio_set_dev(bio, dc->bdev);
+	bio->bi_iter.bi_sector = start;
+	bio->bi_iter.bi_size = nr_sectors << 9;
+
+	bch_bio_map(bio, NULL);
+	if (bch_bio_alloc_pages(bio, __GFP_NOWARN | GFP_NOIO))
+		goto out_put;
+
+	return bio;
+out_put:
+	bio_put(bio);
+	return NULL;
+}
+
+/*
+ * Turn one prefetch record into a read on the matching cached device.
+ *
+ * The device is looked up by item->dev, a bio covering the requested
+ * range is built, and the read is started through cached_dev_read() with
+ * a search allocated in prefetch mode.
+ *
+ * Returns 0 once the read has been started, or -1 when no device matches
+ * or the bio could not be built.
+ */
+int process_one_request(struct acache_info *item)
+{
+	struct cached_dev *dc = get_cached_device_by_dev(item->dev);
+	struct bio *prefetch_bio;
+	struct search *s;
+
+	if (!dc)
+		return -1;
+
+	prefetch_bio = get_bio_by_item(dc, item);
+	if (!prefetch_bio) {
+		pr_err("acache: failed to alloc bio for prefetch");
+		/* Drop the reference taken by get_cached_device_by_dev(). */
+		cached_dev_put(dc);
+		return -1;
+	}
+
+	s = search_alloc(prefetch_bio, &dc->disk, true);
+
+	trace_bcache_prefetch_request(&dc->disk, prefetch_bio);
+	generic_start_io_acct(prefetch_bio->bi_disk->queue,
+			      bio_op(prefetch_bio),
+			      bio_sectors(prefetch_bio),
+			      &s->d->disk->part0);
+
+	cached_dev_read(dc, s);
+	return 0;
+}
+
diff --git a/drivers/md/bcache/acache.h b/drivers/md/bcache/acache.h
new file mode 100644
index 000000000000..dea6e8cb0a05
--- /dev/null
+++ b/drivers/md/bcache/acache.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef _ACHACHE_INTERFACE_H_
+#define _ACHACHE_INTERFACE_H_
+
+#define ACACHE_NR_DEVS 1
+
+#define RING_SIZE
+
+#include "bcache.h"
+
+struct mem_reg {
+	char *data;	/* backing memory of the region, allocated with vmalloc() */
+	unsigned long size;	/* set to ACACHE_DEV_SIZE by acache_dev_init() */
+};
+
+struct acache_info {
+	uint64_t length;	/* request size in bytes (filled from bi_size) */
+	uint64_t offset;	/* start sector (filled from bi_sector) */
+	uint64_t start_time;	/* ktime_get_ns() at submission; elapsed ns in LATENCY records */
+	dev_t dev;	/* device number the request targets */
+	int type;	/* presumably one of enum acache_info_type */
+};
+
+enum acache_info_type {
+	ACACHE_INFO_READ = 0,	/* cached_dev_make_request() stores rw (0 for a read) as the type */
+	ACACHE_INFO_WRITE,
+	ACACHE_INFO_CACHE_INSERT,	/* recorded by bch_data_insert() */
+	ACACHE_INFO_LATENCY,	/* recorded by cached_dev_read_done_bh() */
+};
+
+struct acache_circ {
+	spinlock_t lock;	/* held around every head/tail update */
+	int tail;	/* index of the next record to read */
+	int head;	/* index of the next slot to fill */
+	int size;	/* slot count; a power of two produced by ACACHE_CIRC_SIZE */
+	int item_size;	/* NOTE(review): never assigned in acache.c - confirm intended use */
+	struct acache_info data[0];	/* record slots follow the header */
+};
+
+struct acache_metadata {
+	uint32_t magic;	/* set to ACACHE_MAGIC by acache_dev_init() */
+	uint32_t conntype;	/* set to ACACHE_READWRITE_CONN by acache_dev_init() */
+	uint32_t devsize;	/* set to acache_dev_size by acache_dev_init() */
+};
+
+#define ACACHE_DEV_SIZE acache_dev_size
+#define ACACHE_MAGIC 2
+
+enum acache_conn_types {
+	ACACHE_NO_CONN = 0,
+	ACACHE_READWRITE_CONN = 2,	/* the only type for which read()/write() are served */
+};
+
+#define ACACHE_CIRC_SIZE /* largest power of two not above the record count that fits */ \
+	({int i = (ACACHE_DEV_SIZE - sizeof(struct acache_circ))/sizeof(struct acache_info); \
+	int bits = 0; \
+	while (i > 0) {i >>= 1; bits++; } /* bits = position of the highest set bit */ \
+	  1 << (bits - 1); }) /* NOTE(review): shifts by -1 if no record fits */
+
+
+#define  ACACHE_GET_METADATA	_IOR('a', 1, struct acache_metadata)
+
+int acache_dev_init(void);
+void acache_dev_exit(void);
+struct acache_info *fetch_circ_item(struct acache_circ *circ);
+void save_circ_item(struct acache_info *data);
+
+#endif
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 756fc5425d9b..8a65a859bc48 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -176,8 +176,11 @@
  * - updates to non leaf nodes just happen synchronously (see btree_split()).
  */
-#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+/*
+ * Always install the bcache message prefix.  Only the #undef is
+ * conditional: with the #define inside the #ifdef, files that include this
+ * header before any other header would never get the prefix or the
+ * trailing newline.
+ */
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
 #include <linux/bcache.h>
 #include <linux/bio.h>
 #include <linux/kobject.h>
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 8a075fac5d36..0dc71d561050 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -2020,12 +2020,12 @@ static bool btree_insert_key(struct btree *b, struct bkey *k,
    BUG_ON(bkey_cmp(k, &b->key) > 0);
status = bch_btree_insert_key(&b->keys, k, replace_key);
+	trace_bcache_btree_insert_key(b, k, replace_key != NULL,
+				      status);
    if (status != BTREE_INSERT_STATUS_NO_INSERT) {
    	bch_check_keys(&b->keys, "%u for %s", status,
    		       replace_key ? "replace" : "insert");
-		trace_bcache_btree_insert_key(b, k, replace_key != NULL,
-					      status);
    	return true;
    } else
    	return false;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 820d8402a1dc..55588d13255d 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -8,11 +8,13 @@
  */
#include "bcache.h"
+#include "acache.h"
 #include "btree.h"
 #include "debug.h"
 #include "request.h"
 #include "writeback.h"
+#include <linux/time.h>
 #include <linux/module.h>
 #include <linux/hash.h>
 #include <linux/random.h>
@@ -308,10 +310,18 @@ static void bch_data_insert_start(struct closure *cl)
 void bch_data_insert(struct closure *cl)
 {
    struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	struct acache_info msg;
trace_bcache_write(op->c, op->inode, op->bio,
    		   op->writeback, op->bypass);
+	msg.offset = op->bio->bi_iter.bi_sector;
+	msg.length = op->bio->bi_iter.bi_size;
+	msg.type = ACACHE_INFO_CACHE_INSERT;
+	msg.dev = bio_dev(op->bio);
+	msg.start_time = ktime_get_ns();
+	save_circ_item(&msg);
+
    bch_keylist_init(&op->insert_keys);
    bio_get(op->bio);
    bch_data_insert_start(cl);
@@ -460,27 +470,6 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
/* Cache lookup */
-struct search {
-	/* Stack frame for bio_complete */
-	struct closure		cl;
-
-	struct bbio		bio;
-	struct bio		*orig_bio;
-	struct bio		*cache_miss;
-	struct bcache_device	*d;
-
-	unsigned int		insert_bio_sectors;
-	unsigned int		recoverable:1;
-	unsigned int		write:1;
-	unsigned int		read_dirty_data:1;
-	unsigned int		cache_missed:1;
-
-	unsigned long		start_time;
-
-	struct btree_op		op;
-	struct data_insert_op	iop;
-};
-
 static void bch_cache_read_endio(struct bio *bio)
 {
    struct bbio *b = container_of(bio, struct bbio, bio);
@@ -539,6 +528,7 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
    	return MAP_CONTINUE;
/* XXX: figure out best pointer - for multiple cache devices */
+
    ptr = 0;
PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
@@ -556,6 +546,7 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
    bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key);
    bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);
+	if (!s->prefetch) {
    n->bi_end_io	= bch_cache_read_endio;
    n->bi_private	= &s->cl;
@@ -571,6 +562,9 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
     */
__bch_submit_bbio(n, b->c);
+	} else {
+		bio_put(n);
+	}
    return n == bio ? MAP_DONE : MAP_CONTINUE;
 }
@@ -673,7 +667,12 @@ static void bio_complete(struct search *s)
trace_bcache_request_end(s->d, s->orig_bio);
    	s->orig_bio->bi_status = s->iop.status;
-		bio_endio(s->orig_bio);
+		if (s->prefetch) {
+			bio_free_pages(s->orig_bio);
+			bio_put(s->orig_bio);
+		} else {
+			bio_endio(s->orig_bio);
+		}
    	s->orig_bio = NULL;
    }
 }
@@ -698,7 +697,7 @@ static void do_bio_hook(struct search *s,
    bio_cnt_set(bio, 3);
 }
-static void search_free(struct closure *cl)
+void search_free(struct closure *cl)
 {
    struct search *s = container_of(cl, struct search, cl);
@@ -712,8 +711,8 @@ static void search_free(struct closure *cl)
    mempool_free(s, &s->iop.c->search);
 }
-static inline struct search *search_alloc(struct bio *bio,
-					  struct bcache_device *d)
+struct search *search_alloc(struct bio *bio,
+			    struct bcache_device *d, bool prefetch)
 {
    struct search *s;
@@ -731,6 +730,7 @@ static inline struct search *search_alloc(struct bio *bio,
    s->write		= op_is_write(bio_op(bio));
    s->read_dirty_data	= 0;
    s->start_time		= jiffies;
+	s->prefetch		= prefetch;
s->iop.c		= d->c;
    s->iop.bio		= NULL;
@@ -830,22 +830,27 @@ static void cached_dev_read_done(struct closure *cl)
    if (s->iop.bio) {
    	bio_reset(s->iop.bio);
    	s->iop.bio->bi_iter.bi_sector =
-			s->cache_miss->bi_iter.bi_sector;
+		    s->cache_miss->bi_iter.bi_sector;
    	bio_copy_dev(s->iop.bio, s->cache_miss);
    	s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
    	bch_bio_map(s->iop.bio, NULL);
-		bio_copy_data(s->cache_miss, s->iop.bio);
+		if (!s->prefetch)
+			bio_copy_data(s->cache_miss, s->iop.bio);
+		else
+			trace_bcache_prefetch_cache_miss(s->iop.bio);
bio_put(s->cache_miss);
    	s->cache_miss = NULL;
+
    }
if (verify(dc) && s->recoverable && !s->read_dirty_data)
    	bch_data_verify(dc, s->orig_bio);
closure_get(&dc->disk.cl);
-	bio_complete(s);
+	if (!s->prefetch)
+		bio_complete(s);
if (s->iop.bio &&
        !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) {
@@ -861,10 +866,19 @@ static void cached_dev_read_done_bh(struct closure *cl)
    struct search *s = container_of(cl, struct search, cl);
    struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
-	bch_mark_cache_accounting(s->iop.c, s->d,
+	if (s->prefetch)
+		pr_debug("prefetch request; do not count cache_missed");
+	else
+		bch_mark_cache_accounting(s->iop.c, s->d,
    			  !s->cache_missed, s->iop.bypass);
    trace_bcache_read(s->orig_bio, !s->cache_missed, s->iop.bypass);
+	if (!s->prefetch && !s->iop.status) {
+		s->smp.type = ACACHE_INFO_LATENCY;
+		s->smp.start_time = ktime_get_ns() - s->smp.start_time;
+		save_circ_item(&s->smp);
+	}
+
    if (s->iop.status)
    	continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
    else if (s->iop.bio || verify(dc))
@@ -890,8 +904,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
    }
if (!(bio->bi_opf & REQ_RAHEAD) &&
-	    !(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
-	    s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
+	    !(bio->bi_opf & REQ_META) &&
+	    s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA &&
+	    !s->prefetch)
    	reada = min_t(sector_t, dc->readahead >> 9,
    		      get_capacity(bio->bi_disk) - bio_end_sector(bio));
@@ -932,8 +947,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
    if (reada)
    	bch_mark_cache_readahead(s->iop.c, s->d);
-	s->cache_miss	= miss;
-	s->iop.bio	= cache_bio;
+	s->cache_miss = miss;
+	s->iop.bio = cache_bio;
    bio_get(cache_bio);
    /* I/O request sent to backing device */
    closure_bio_submit(s->iop.c, cache_bio, &s->cl);
@@ -942,14 +957,18 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 out_put:
    bio_put(cache_bio);
 out_submit:
-	miss->bi_end_io		= backing_request_endio;
-	miss->bi_private	= &s->cl;
-	/* I/O request sent to backing device */
-	closure_bio_submit(s->iop.c, miss, &s->cl);
+	if (!s->prefetch) {
+		miss->bi_end_io		= backing_request_endio;
+		miss->bi_private	= &s->cl;
+		/* I/O request sent to backing device */
+		closure_bio_submit(s->iop.c, miss, &s->cl);
+	} else {
+		bio_put(miss);
+	}
    return ret;
 }
-static void cached_dev_read(struct cached_dev *dc, struct search *s)
+void cached_dev_read(struct cached_dev *dc, struct search *s)
 {
    struct closure *cl = &s->cl;
@@ -1196,11 +1215,12 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
    		      bio_sectors(bio),
    		      &d->disk->part0);
+
    bio_set_dev(bio, dc->bdev);
    bio->bi_iter.bi_sector += dc->sb.data_offset;
if (cached_dev_get(dc)) {
-		s = search_alloc(bio, d);
+		s = search_alloc(bio, d, false);
    	trace_bcache_request_start(s->d, bio);
if (!bio->bi_iter.bi_size) {
@@ -1214,6 +1234,15 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
    	} else {
    		s->iop.bypass = check_should_bypass(dc, bio);
+			if (!s->iop.bypass && bio->bi_iter.bi_size && !rw) {
+				s->smp.offset = bio->bi_iter.bi_sector - dc->sb.data_offset;
+				s->smp.length = bio->bi_iter.bi_size;
+				s->smp.type = rw;
+				s->smp.dev = dc->bdev->bd_dev;
+				s->smp.start_time = ktime_get_ns();
+				save_circ_item(&s->smp);
+			}
+
    		if (rw)
    			cached_dev_write(dc, s);
    		else
@@ -1316,7 +1345,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
-	s = search_alloc(bio, d);
+	s = search_alloc(bio, d, false);
    cl = &s->cl;
    bio = &s->bio.bio;
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index c64dbd7a91aa..6366b8861974 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -1,6 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _BCACHE_REQUEST_H_
 #define _BCACHE_REQUEST_H_
+#include "btree.h"
+#include "acache.h"
struct data_insert_op {
    struct closure		cl;
@@ -41,4 +43,33 @@ void bch_flash_dev_request_init(struct bcache_device *d);
extern struct kmem_cache *bch_search_cache;
+struct search {
+	/* Stack frame for bio_complete */
+	struct closure		cl;
+
+	struct bbio		bio;
+	struct bio		*orig_bio;
+	struct bio		*cache_miss;
+	struct bcache_device	*d;
+
+	unsigned int		insert_bio_sectors;
+	unsigned int		recoverable:1;
+	unsigned int		write:1;
+	unsigned int		read_dirty_data:1;
+	unsigned int		cache_missed:1;
+
+	unsigned long		start_time;
+	/* for prefetch, we do not need to copy data to the bio */
+	bool			prefetch;
+	struct list_head	list_node;	/* NOTE(review): no user visible in this patch */
+	wait_queue_head_t	wqh;	/* NOTE(review): no user visible in this patch */
+	struct acache_info		smp;	/* I/O record passed to save_circ_item() */
+
+	struct btree_op		op;
+	struct data_insert_op	iop;
+};
+
+void search_free(struct closure *cl);
+struct search *search_alloc(struct bio *bio, struct bcache_device *d, bool prefetch);
+void cached_dev_read(struct cached_dev *dc, struct search *s);
 #endif /* _BCACHE_REQUEST_H_ */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 94045d72952c..b272f0c1ff3b 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -8,6 +8,7 @@
  */
#include "bcache.h"
+#include "acache.h"
 #include "btree.h"
 #include "debug.h"
 #include "extents.h"
@@ -2625,6 +2626,7 @@ static void bcache_exit(void)
if (bcache_major)
    	unregister_blkdev(bcache_major, "bcache");
+	acache_dev_exit();
    unregister_reboot_notifier(&reboot);
    mutex_destroy(&bch_register_lock);
 }
@@ -2695,6 +2697,8 @@ static int __init bcache_init(void)
bch_debug_init();
    closure_debug_init();
+	if (acache_dev_init())
+		goto err;
bcache_is_reboot = false;
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index e4526f85c19d..cb15af32291e 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -75,6 +75,12 @@ DECLARE_EVENT_CLASS(btree_node,
    TP_printk("bucket %zu", __entry->bucket)
 );
+/* acache.c */
+DEFINE_EVENT(bcache_request, bcache_prefetch_request,
+	TP_PROTO(struct bcache_device *d, struct bio *bio),
+	TP_ARGS(d, bio)
+);
+
 /* request.c */
DEFINE_EVENT(bcache_request, bcache_request_start,
@@ -120,6 +126,11 @@ DEFINE_EVENT(bcache_bio, bcache_bypass_congested,
    TP_ARGS(bio)
 );
+DEFINE_EVENT(bcache_bio, bcache_prefetch_cache_miss,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio)
+);
+
 TRACE_EVENT(bcache_read,
    TP_PROTO(struct bio *bio, bool hit, bool bypass),
    TP_ARGS(bio, hit, bypass),
-- 
2.25.1

    

2024

2023

2022

2021

2020

2019

[PATCH kernel-4.19 1/5] bcache: add a framework to perform prefetch